Author: siren Date: Sun Jun 4 12:43:47 2006 New Revision: 411593 URL: http://svn.apache.org/viewvc?rev=411593&view=rev Log: initial import of web-keymatch plugin
Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/README.txt lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/build.xml lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/keymatches.xml lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/lib/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/plugin.xml lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/conf/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/conf/tiles-defs.xml lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/AbstractFilter.java lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/CountFilter.java lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/DomUtil.java lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/KeyMatch.java lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/KeyMatchFilter.java lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/SimpleKeyMatcher.java lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/ViewCountSorter.java lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/package.html lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/webapp/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/webapp/controller/ 
lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/webapp/controller/KeyMatchController.java lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/resources/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/keymatch/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/keymatch/TestSimpleKeyMatcher.java lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/keymatch/TestViewCountSorter.java lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/web/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/web/web-keymatch/ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/web/web-keymatch/keymatch.jsp Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/README.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/README.txt?rev=411593&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/README.txt (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/README.txt Sun Jun 4 12:43:47 2006 @@ -0,0 +1,9 @@ +Instructions + +0. see general instructions from web2 README.txt +1. Copy your keymatches.xml to nutch config directory. +2. enable web-keymatches +3. insert <tiles:insert name="keymatch"/> into web page to enable +functionality +4. ant war +5. 
deploy war Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/build.xml?rev=411593&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/build.xml (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/build.xml Sun Jun 4 12:43:47 2006 @@ -0,0 +1,19 @@ +<?xml version="1.0"?> +<project name="web-keymatch" default="jar-core"> + <import file="../build-plugin.xml" /> + <property name="nutch.root" location="${root}/../../../../" /> + <target name="init-plugin"> + <echo>Copying resources templates</echo> + <copy todir="${build.classes}/resources"> + <fileset dir="${resources.dir}" includes="**/*" /> + </copy> + <echo>Copying UI configuration</echo> + <copy todir="${build.classes}"> + <fileset dir="src/conf" includes="**/*"/> + </copy> + <echo>Copying UI templates</echo> + <copy todir="${deploy.dir}/web"> + <fileset dir="src/web" includes="**/*"/> + </copy> + </target> +</project> Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/keymatches.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/keymatches.xml?rev=411593&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/keymatches.xml (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/keymatches.xml Sun Jun 4 12:43:47 2006 @@ -0,0 +1,23 @@ +<?xml version="1.0"?> +<keymatches> + <keymatch type="keyword"> + <term>mapred</term> + <url>http://lucene.apache.org/hadoop/</url> + <title>Try Hadoop today!</title> + </keymatch> + <keymatch type="phrase"> + <term>search engine</term> + <url>http://lucene.apache.org/nutch/</url> + <title>Try nutch!</title> + </keymatch> + <keymatch type="exact"> + <term>apache search engine</term> + <url>http://lucene.apache.org/nutch/</url> + 
<title>Try apache nutch!</title> + </keymatch> + <keymatch type="exact"> + <term>kw1 kw2 kw3 kw4</term> + <url>url/</url> + <title>title</title> + </keymatch> +</keymatches> \ No newline at end of file Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/plugin.xml?rev=411593&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/plugin.xml (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/plugin.xml Sun Jun 4 12:43:47 2006 @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="web-keymatch" + name="KeyMatcher for promoting urls" + version="1.0.0" + provider-name="apache.org"> + + <runtime> + <library name="web-keymatch.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.webapp.extension.UIExtensionPoint" + name="Nutch ui extension point" + point="org.apache.nutch.webapp.extension.UIExtensionPoint"> + <implementation id="web-keymatch" + class="org.apache.nutch.webapp.extension.UIExtension.VoidImplementation"/> + </extension> + +</plugin> Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/conf/tiles-defs.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/conf/tiles-defs.xml?rev=411593&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/conf/tiles-defs.xml (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/conf/tiles-defs.xml Sun Jun 4 12:43:47 2006 @@ -0,0 +1,9 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE tiles-definitions PUBLIC "-//Apache Software Foundation//DTD Tiles Configuration 1.1//EN" + "http://struts.apache.org/dtds/tiles-config_1_1.dtd"> 
+<tiles-definitions> + <definition name="keymatch" extends="decoratedDefinition" controllerClass="org.apache.nutch.webapp.controller.KeyMatchController"> + <put name="name" type="string">keymatch</put> + <put name="decorator" type="string" value="/plugin/web-keymatch/keymatch.jsp"/> + </definition> +</tiles-definitions> \ No newline at end of file Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/AbstractFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/AbstractFilter.java?rev=411593&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/AbstractFilter.java (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/AbstractFilter.java Sun Jun 4 12:43:47 2006 @@ -0,0 +1,49 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.nutch.keymatch;

import java.util.List;
import java.util.Map;

/**
 * Skeleton {@link KeyMatchFilter} that implements the chaining part of the
 * contract. Subclasses adjust the list of matches and then call
 * <code>super.filter(...)</code>, which either forwards the list to the next
 * filter or, at the end of the chain, produces the final result.
 */
public abstract class AbstractFilter implements KeyMatchFilter {

  /** Successor in the chain; <code>null</code> marks the end of the chain. */
  KeyMatchFilter next=null;

  /**
   * Links the filter that runs after this one.
   *
   * @param next the successor, or <code>null</code> to make this filter the
   *        last one in the chain
   * @see KeyMatchFilter#setNext(KeyMatchFilter)
   */
  public void setNext(KeyMatchFilter next){
    this.next=next;
  }

  /**
   * Passes the matches on to the next filter. When there is no next filter,
   * increments the view count of every match in the list and returns the
   * list contents as an array.
   *
   * @param matches current list of {@link KeyMatch} objects
   * @param context evaluation context, handed on unchanged
   * @return the result of the next filter, or the matches as an array when
   *         this filter is the last one
   * @see KeyMatchFilter#filter(List, Map)
   */
  public KeyMatch[] filter(List matches, Map context) {
    // Not the last link: delegate and leave the view counts alone.
    if (next!=null) {
      return next.filter(matches, context);
    }

    // Last link: every match that is returned counts as viewed once more.
    final int total=matches.size();
    final KeyMatch[] shown=new KeyMatch[total];
    for(int idx=0;idx<total;idx++){
      final KeyMatch match=(KeyMatch)matches.get(idx);
      match.viewCount++;
      shown[idx]=match;
    }
    return shown;
  }
}
/*
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.keymatch;

import java.util.List;
import java.util.Map;

/**
 * <p>Implementation of KeyMatchFilter that crops the list of matches to a
 * maximum size before handing it on.</p>
 *
 * <p>The maximum is read from the context under the key
 * {@link #KEY_COUNT}; when it is missing or unusable
 * {@link #DEFAULT_COUNT} is used.</p>
 *
 * @author Sami Siren
 */
public class CountFilter extends AbstractFilter {

  public static final String KEY_COUNT="count";
  public static final int DEFAULT_COUNT=3;

  /**
   * Keeps at most the configured number of leading matches and passes them
   * to {@link AbstractFilter#filter(List, Map)}.
   *
   * @param matches current list of matches
   * @param context evaluation context; may carry the limit under
   *        {@link #KEY_COUNT}
   * @return the result of the rest of the chain for the cropped list
   */
  public KeyMatch[] filter(List matches, Map context) {
    final int limit=resolveCount(context);
    // subList is a view of the original list, not a copy.
    final List cropped=(matches.size()>limit) ? matches.subList(0,limit) : matches;
    return super.filter(cropped, context);
  }

  /**
   * Reads the limit from the context. Accepts a String (surrounding
   * whitespace is ignored) or any Number. A missing context or value, an
   * unparsable String, another value type, or a negative number all yield
   * {@link #DEFAULT_COUNT}; a negative limit would otherwise make
   * <code>subList(0, limit)</code> throw.
   */
  private static int resolveCount(Map context) {
    if (context==null) {
      return DEFAULT_COUNT;
    }

    final Object raw=context.get(KEY_COUNT);
    int parsed=DEFAULT_COUNT;
    if (raw instanceof Number) {
      parsed=((Number)raw).intValue();
    } else if (raw instanceof String) {
      try {
        parsed=Integer.parseInt(((String)raw).trim());
      } catch (NumberFormatException ignored) {
        // not a number: fall back to the default limit
      }
    }
    return (parsed<0) ? DEFAULT_COUNT : parsed;
  }
}
/*
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.keymatch;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.xerces.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * Small helpers for reading an XML stream into a DOM tree and writing a DOM
 * element back out. Both methods are best-effort: failures are logged and
 * not propagated to the caller.
 */
public class DomUtil {

  private static final Logger LOG = Logger.getLogger(DomUtil.class.getName());

  /**
   * Parses the stream (read as UTF-8) and returns the document's root
   * element.
   *
   * @param is stream containing the XML document; the stream is not closed
   *        here
   * @return the root element, or <code>null</code> if the stream is
   *         <code>null</code> or the document cannot be read or parsed
   */
  public static Element getDom(InputStream is) {
    if (is == null) {
      return null;
    }

    final DOMParser parser = new DOMParser();
    try {
      final InputSource input = new InputSource(is);
      input.setEncoding("UTF-8");
      parser.parse(input);

      // Ask for the document element explicitly: the first child of the
      // document can be a comment, doctype or processing instruction.
      final Document document = parser.getDocument();
      return (document == null) ? null : document.getDocumentElement();
    } catch (SAXException e) {
      LOG.log(Level.WARNING, "Could not parse XML document", e);
    } catch (IOException e) {
      LOG.log(Level.WARNING, "Could not read XML document", e);
    }
    return null;
  }

  /**
   * Serializes the element, with indentation, into the output stream and
   * flushes it. The stream is not closed here.
   *
   * @param os stream to write to
   * @param e element to serialize
   */
  public static void saveDom(OutputStream os, Element e) {
    final DOMSource source = new DOMSource(e);
    final TransformerFactory transFactory = TransformerFactory.newInstance();
    try {
      final Transformer transformer = transFactory.newTransformer();
      transformer.setOutputProperty(OutputKeys.INDENT, "yes");
      transformer.transform(source, new StreamResult(os));
      os.flush();
    } catch (IOException ex) {
      LOG.log(Level.WARNING, "Could not write XML document", ex);
    } catch (TransformerException ex) {
      // also covers TransformerConfigurationException
      LOG.log(Level.WARNING, "Could not serialize XML document", ex);
    }
  }
}
/*
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.keymatch;

import org.apache.xerces.util.DOMUtil;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;

/**
 * One promoted link: the term it is registered for, the target url and
 * title, and how the term is matched (index into {@link #TYPES}).
 */
public class KeyMatch {

  public static final String TAG_TERM = "term";

  public static final String TAG_URL = "url";

  public static final String TAG_TITLE = "title";

  public static final String ATTR_TYPE = "type";

  public static final String TYPES[] = { "keyword", "phrase", "exact" };

  public static final int TYPE_KEYWORD=0;
  public static final int TYPE_PHRASE=1;
  public static final int TYPE_EXACT=2;

  /** Source of the numeric part of {@link #identifier}. */
  static int counter = 0;

  String term;
  String url;
  String title;
  int type;
  /** Number of times this match has been returned from a filter chain. */
  int viewCount=0;

  transient String identifier;

  public KeyMatch() {
    // generate unique id
    this.identifier = "m-" + counter++;
  }

  /**
   * @param terms term the match is registered for
   * @param url target url
   * @param title title shown for the url
   * @param type one of the <code>TYPE_*</code> constants; any other value
   *        is treated as {@link #TYPE_KEYWORD}
   */
  public KeyMatch(String terms, String url, String title, int type) {
    this();
    this.term = terms;
    this.url = url;
    this.title = title;
    this.type = normalizeType(type);
  }

  /** Maps anything that is not a valid index into TYPES to TYPE_KEYWORD. */
  private static int normalizeType(int type) {
    return (type < 0 || type >= TYPES.length) ? TYPE_KEYWORD : type;
  }

  /**
   * Initialize object from Element. Reads the <code>type</code> attribute
   * and the trimmed text of the first <code>term</code>, <code>url</code>
   * and <code>title</code> descendants. A field whose element is missing, or
   * a type that is not listed in {@link #TYPES}, keeps its current value;
   * the remaining fields are still read.
   *
   * @param element the <code>keymatch</code> element; ignored when
   *        <code>null</code>
   */
  public void initialize(final Element element) {
    if (element == null) {
      return;
    }

    final String parsedTerm = childText(element, TAG_TERM);
    if (parsedTerm != null) {
      term = parsedTerm;
    }

    final String stype = element.getAttribute(ATTR_TYPE);
    for (int i = 0; i < TYPES.length; i++) {
      if (TYPES[i].equals(stype)) {
        type = i;
        break;
      }
    }

    final String parsedUrl = childText(element, TAG_URL);
    if (parsedUrl != null) {
      url = parsedUrl;
    }

    final String parsedTitle = childText(element, TAG_TITLE);
    if (parsedTitle != null) {
      title = parsedTitle;
    }
  }

  /** Trimmed text of the first descendant with the given tag, or null. */
  private static String childText(Element parent, String tag) {
    final Node child = parent.getElementsByTagName(tag).item(0);
    if (child == null) {
      return null;
    }
    final String text = DOMUtil.getChildText(child);
    return (text == null) ? null : text.trim();
  }

  /**
   * Fill in element with data from this object: sets the <code>type</code>
   * attribute and appends <code>term</code>, <code>url</code> and
   * <code>title</code> child elements.
   *
   * @param element the <code>keymatch</code> element to fill
   */
  public void populateElement(final Element element) {
    final Document doc = element.getOwnerDocument();
    appendTextChild(doc, element, TAG_TERM, this.term);
    element.setAttribute(ATTR_TYPE, TYPES[normalizeType(type)]);
    appendTextChild(doc, element, TAG_URL, this.url);
    appendTextChild(doc, element, TAG_TITLE, this.title);
  }

  /**
   * Appends <code>&lt;tag&gt;value&lt;/tag&gt;</code> to the parent. The
   * value has to go into a text node: setNodeValue on an element node has
   * no effect, so it would leave the element empty.
   */
  private static void appendTextChild(Document doc, Element parent,
      String tag, String value) {
    final Element child = doc.createElement(tag);
    child.appendChild(doc.createTextNode(value == null ? "" : value));
    parent.appendChild(child);
  }

  /**
   * @return Returns the term.
   */
  public String getTerm() {
    return term;
  }

  /**
   * @param term
   *          The term to set.
   */
  public void setTerm(final String term) {
    this.term = term;
  }

  /**
   * @return Returns the title.
   */
  public String getTitle() {
    return title;
  }

  /**
   * @param title
   *          The title to set.
   */
  public void setTitle(final String title) {
    this.title = title;
  }

  /**
   * @return Returns the url.
   */
  public String getUrl() {
    return url;
  }

  /**
   * @param url
   *          The url to set.
   */
  public void setUrl(final String url) {
    this.url = url;
  }

  /**
   * Two key matches are equal when type, term, title and url are equal;
   * <code>null</code> fields only equal other <code>null</code> fields.
   * The view count and identifier are not compared.
   *
   * @see java.lang.Object#equals(java.lang.Object)
   */
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof KeyMatch)) {
      return false;
    }
    final KeyMatch other = (KeyMatch) obj;
    return other.type == type
        && same(other.term, term)
        && same(other.title, title)
        && same(other.url, url);
  }

  /**
   * Hash over the same fields that {@link #equals(Object)} compares.
   *
   * @see java.lang.Object#hashCode()
   */
  public int hashCode() {
    int hash = type;
    hash = 31 * hash + (term == null ? 0 : term.hashCode());
    hash = 31 * hash + (title == null ? 0 : title.hashCode());
    hash = 31 * hash + (url == null ? 0 : url.hashCode());
    return hash;
  }

  /** Null-safe string equality. */
  private static boolean same(String a, String b) {
    return (a == null) ? (b == null) : a.equals(b);
  }

  /**
   * @return Returns the type.
   */
  public int getType() {
    return type;
  }

  /**
   * @param type The type to set; a value that is not one of the
   *        <code>TYPE_*</code> constants is stored as
   *        {@link #TYPE_KEYWORD}, as in the constructor.
   */
  public void setType(int type) {
    this.type = normalizeType(type);
  }
}
/*
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.keymatch;

import java.util.List;
import java.util.Map;

/**
 * <p>One step in a chain of filters applied to a list of key matches.</p>
 *
 * <p>All implementing classes should extend AbstractFilter.</p>
 */
public interface KeyMatchFilter {

  /**
   * Do filtering for matches.
   *
   * @param matches current List of matches
   * @param context the evaluation context
   * @return the key matches that remain after filtering
   */
  public KeyMatch[] filter(List matches, Map context);

  /**
   * <p>Set the next filter that is processed after this one.</p>
   *
   * @param filter the filter to set
   */
  public void setNext(KeyMatchFilter filter);

}
/*
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.keymatch;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Iterator;
import java.util.logging.Logger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.LogFormatter;
import org.apache.nutch.searcher.Query;
import org.apache.xerces.dom.DocumentImpl;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

/**
 * <p>SimpleKeyMatcher is responsible for targeting predefined links at
 * defined keywords, for example to promote some urls that are not yet part
 * of the production index.</p>
 * <p>SimpleKeyMatcher is not a text ad targeting system.</p>
 * <p>KeyMatcher is configured with an xml configuration file:
 * <br><pre>
 * &lt;?xml version="1.0"?&gt;
 * &lt;keymatches&gt;
 *   &lt;keymatch type="keyword|phrase|exact"&gt;
 *     &lt;term&gt;search engine&lt;/term&gt;
 *     &lt;url&gt;http://lucene.apache.org/nutch&lt;/url&gt;
 *     &lt;title&gt;Your favourite search engine!&lt;/title&gt;
 *   &lt;/keymatch&gt;
 * &lt;/keymatches&gt;</pre>
 * By default KeyMatcher expects the file to be named keymatches.xml
 * </p>
 * <p>Match type can be one of the following: keyword, phrase, exact match.
 * Terms of a query are produced by the Query object and none of the
 * matches is case sensitive.</p>
 * <b>keyword</b><br>
 * Matches on keyword level, for example query "search engine" would match
 * both keywords search and engine<br>
 * <br>
 * <b>phrase</b><br>
 * Matches phrase, for example: queries "open source search engine" and
 * "search engine watch" would match "search engine", but query
 * "search from engine" would not. Only phrases of up to
 * {@link #MAX_TERMS} terms are looked up.<br>
 * <br>
 * <b>exact</b><br>
 * Query "open source search engine" would match "open source search
 * engine", but not "search engine" nor "best open source engine"<br>
 */
public class SimpleKeyMatcher extends Configured {

  static final char PREFIX_KEYWORD='k';
  static final char PREFIX_PHRASE='p';
  static final char PREFIX_EXACT='e';

  /**
   * Counts registered phrase key matches by their number of terms, so that
   * {@link SimpleKeyMatcher#getMatches(Query, Map)} only builds phrase keys
   * of lengths that can actually match.
   */
  class KeyMatcherStats {
    /** terms[n] is the number of registered phrases with n terms. */
    int terms[];

    /** Records one phrase with numTerms terms; other sizes are ignored. */
    void addStats(int numTerms) {
      if (numTerms >= 0 && numTerms < terms.length) {
        terms[numTerms]++;
      }
    }

    /**
     * @param size largest number of terms that is tracked
     */
    public KeyMatcherStats(int size) {
      // one slot per term count 0..size, so index "size" itself is valid
      terms = new int[size + 1];
    }
  }

  static final Logger LOG = LogFormatter.getLogger(SimpleKeyMatcher.class
      .getName());

  public static final String TAG_KEYMATCH = "keymatch";

  public static final String TAG_KEYMATCHES = "keymatches";

  static final String DEFAULT_CONFIG_FILE = "keymatches.xml";

  /** Longest phrase, in terms, that is tracked in the statistics. */
  static final int MAX_TERMS = 5;

  KeyMatcherStats stats;
  KeyMatchFilter currentFilter;

  /**
   * Prefixed, lower-cased term -> a single KeyMatch, or an ArrayList of
   * KeyMatch when several key matches share the same key.
   */
  HashMap matches = new HashMap();
  private String configName;

  public SimpleKeyMatcher(Configuration conf) {
    this(DEFAULT_CONFIG_FILE,conf);
  }

  /**
   * Sets currentFilter
   * @param filter the filter to set
   */
  public void setFilter(KeyMatchFilter filter) {
    this.currentFilter=filter;
  }

  /**
   * Construct new SimpleKeyMatcher with provided filename and configuration.
   * Installs a ViewCountSorter as the filter and loads the key matches.
   *
   * @param resourceName name of the configuration resource to load
   * @param conf configuration used to locate the resource
   */
  public SimpleKeyMatcher(String resourceName, Configuration conf) {
    super(conf);
    configName=resourceName;
    stats = new KeyMatcherStats(MAX_TERMS);
    currentFilter=new ViewCountSorter();
    init();
  }

  /**
   * Initialize keyword matcher: loads the key matches from the configured
   * resource. When the resource is missing or cannot be parsed the current
   * key matches are left untouched.
   */
  protected void init() {
    final InputStream input = getConf().getConfResourceAsInputStream(
        configName);
    if (input == null) {
      return;
    }

    final Element root;
    try {
      root = DomUtil.getDom(input);
    } finally {
      try {
        input.close();
      } catch (IOException e1) {
        LOG.warning("Error closing " + configName + ": " + e1);
      }
    }

    // DomUtil.getDom returns null when parsing fails
    if (root == null) {
      LOG.warning("Could not parse " + configName);
      return;
    }

    final NodeList nodeList = root.getElementsByTagName(TAG_KEYMATCH);
    LOG.fine("Configuration file has " + nodeList.getLength()
        + " KeyMatch entries.");

    final HashMap tempMap = new HashMap();
    for (int i = 0; i < nodeList.getLength(); i++) {
      final Element element = (Element) nodeList.item(i);
      final KeyMatch keyMatch = new KeyMatch();
      keyMatch.initialize(element);
      addKeyMatch(tempMap, keyMatch);
    }

    matches=tempMap;
  }

  /**
   * Get keymatches for query
   * @param query parsed query
   * @param context evaluation context
   * @return array of keymatches
   */
  public KeyMatch[] getMatches(final Query query, Map context) {

    final ArrayList currentMatches=new ArrayList();

    final String terms[]=query.getTerms();

    //"keyword"
    for(int i=0;i<terms.length;i++){
      LOG.fine("keyword: '" + terms[i] + "'");
      addMatches(currentMatches, lookup(PREFIX_KEYWORD, terms[i]));
    }

    //"phrase"
    // Only lengths that the statistics track can have matches; bounding the
    // loop also keeps long queries from indexing past the stats array.
    final int longest=Math.min(terms.length, stats.terms.length-1);
    for(int l=2;l<=longest;l++){
      if(stats.terms[l]>0) {
        for(int p=0;p<=terms.length-l;p++){
          final StringBuffer phrase=new StringBuffer();
          for(int i=p;i<p+l;i++){
            if(i>p) phrase.append(' ');
            phrase.append(terms[i]);
          }
          final String key=phrase.toString();

          LOG.fine("phrase key: '" + key + "'");
          addMatches(currentMatches, lookup(PREFIX_PHRASE, key));
        }
      }
    }

    //"exact"
    String key=query.toString();
    LOG.fine("exact key: '" + key + "'");

    addMatches(currentMatches, lookup(PREFIX_EXACT, key));

    return currentFilter.filter(currentMatches, context);
  }

  /**
   * Looks up the entry for a prefixed key. Keys are stored lower-cased by
   * addKeyMatch, so the lookup text is lower-cased the same way.
   */
  private Object lookup(char prefix, String text) {
    return matches.get(prefix + text.toLowerCase());
  }

  /**
   * Adds a map entry (single KeyMatch or ArrayList of them) to the result
   * list; null entries are ignored.
   */
  void addMatches(ArrayList currentMatches, Object match){
    if(match!=null) {
      if(match instanceof ArrayList) {
        currentMatches.addAll(((ArrayList)match));
      } else {
        currentMatches.add(match);
      }
    }
  }

  /** Get tokens of a string with nutch Query parser
   *
   * @param string text to tokenize
   * @return the terms of the parsed query, or an empty array if parsing
   *         fails
   */
  private String[] getTokens(final String string){
    org.apache.nutch.searcher.Query q;
    try {
      q = org.apache.nutch.searcher.Query.parse(string, getConf());
      return q.getTerms();
    } catch (IOException e) {
      LOG.info("Error getting terms from query:" + e);
    }
    return new String[0];
  }

  /**
   * Add new keymatch to the given map. The term of the keymatch is
   * lower-cased in place. A keymatch without a term is ignored.
   *
   * @param map the map to add to
   * @param keymatch the keymatch to register
   */
  protected void addKeyMatch(Map map, final KeyMatch keymatch) {
    if (keymatch == null || keymatch.term == null) {
      LOG.warning("Ignoring keymatch without a term");
      return;
    }

    // an out-of-range type is handled as keyword, as in the switch below
    final boolean knownType =
      keymatch.type >= 0 && keymatch.type < KeyMatch.TYPES.length;
    final String typeName =
      KeyMatch.TYPES[knownType ? keymatch.type : KeyMatch.TYPE_KEYWORD];

    LOG.info("Adding keymatch: MATCHTYPE=" + typeName + ", TERM='" + keymatch.term + "', TITLE='"
        + keymatch.title + "' ,URL='" + keymatch.url + "'");

    keymatch.term=keymatch.term.toLowerCase();

    final char prefix;
    switch (keymatch.type) {
    case KeyMatch.TYPE_EXACT: prefix=PREFIX_EXACT;break;
    case KeyMatch.TYPE_PHRASE: prefix=PREFIX_PHRASE;break;
    default: prefix=PREFIX_KEYWORD;break;
    }

    //add info about kw count for optimization
    if(keymatch.type==KeyMatch.TYPE_PHRASE) {
      final int tokenCount=getTokens(keymatch.term).length;
      if(tokenCount>MAX_TERMS) {
        LOG.warning("Phrase '" + keymatch.term + "' has more than "
            + MAX_TERMS + " terms and will not be matched");
      }
      stats.addStats(tokenCount);
    }

    final String key=prefix + keymatch.term;

    // Look in the map being filled, not in the live "matches" field.
    final Object existing=map.get(key);
    if(existing==null) {
      map.put(key, keymatch);
    } else if(existing instanceof ArrayList) {
      ((ArrayList)existing).add(keymatch);
    } else {
      final ArrayList l=new ArrayList();
      l.add(existing);
      l.add(keymatch);
      map.put(key,l);
    }
  }

  /**
   * Add Keymatch to the currently active key matches.
   *
   * @param match the keymatch to register
   */
  public void addKeyMatch(KeyMatch match){
    addKeyMatch(matches, match);
  }

  /**
   * Saves keymatch configuration into file.
   *
   * @throws IOException if the configuration resource cannot be found or
   *         the file cannot be opened or closed
   */
  public void save() throws IOException {
    final URL url = getConf().getResource(configName);
    if (url == null) {
      throw new IOException("Resource not found: " + configName);
    }

    // Build the whole document before opening (and truncating) the file.
    final DocumentImpl doc = new DocumentImpl();
    final Element keymatches = doc.createElement(TAG_KEYMATCHES);
    final Iterator iterator = matches.values().iterator();
    while (iterator.hasNext()) {
      final Object value = iterator.next();
      if (value instanceof List) {
        // several key matches share this key
        final Iterator shared = ((List) value).iterator();
        while (shared.hasNext()) {
          appendKeyMatch(doc, keymatches, (KeyMatch) shared.next());
        }
      } else {
        appendKeyMatch(doc, keymatches, (KeyMatch) value);
      }
    }

    final FileOutputStream fos = new FileOutputStream(new File(url.getFile()));
    try {
      DomUtil.saveDom(fos, keymatches);
      fos.flush();
    } finally {
      fos.close();
    }
  }

  /** Appends one keymatch element for the given KeyMatch to the parent. */
  private void appendKeyMatch(DocumentImpl doc, Element parent,
      KeyMatch keyMatch) {
    final Element keymatch = doc.createElement(TAG_KEYMATCH);
    keyMatch.populateElement(keymatch);
    parent.appendChild(keymatch);
  }

  /**
   * Clear keymatches from this SimpleKeyMatcher instance
   *
   */
  public void clear(){
    matches=new HashMap();
  }

  /**
   * Replaces the current key matches with the given ones.
   *
   * @param keymatches list of KeyMatch objects; <code>null</code> is
   *        treated as an empty list
   */
  public void setKeyMatches(List keymatches){
    HashMap hm=new HashMap();
    if(keymatches!=null) {
      Iterator i=keymatches.iterator();
      while(i.hasNext()) {
        KeyMatch km=(KeyMatch)i.next();
        addKeyMatch(hm,km);
      }
    }
    matches=hm;
  }

}
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.keymatch; + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +/** + * <p>Implementation of KeyMatchFilter that sorts + * keymatches based on KeyMatch view counts. It allows + * simple rotation of matches (keymatches with lower view + * counts are "prioritized").</p> + */ +public class ViewCountSorter extends AbstractFilter { + + public class ViewCountComparator implements Comparator { + + public int compare(Object o1, Object o2) { + return ((KeyMatch)o1).viewCount-((KeyMatch)o2).viewCount; + } + } + + public KeyMatch[] filter(final List currentMatches, final Map context){ + Collections.sort(currentMatches, new ViewCountSorter.ViewCountComparator()); + return super.filter(currentMatches, context); + } +} Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/package.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/package.html?rev=411593&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/package.html (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/keymatch/package.html Sun Jun 4 12:43:47 2006 @@ -0,0 +1,6 @@ +<html> +<body> +SimpleKeyMatcher is a utility for promoting certain web pages in your +search wich are not yet part of production index or 
have low score. +</body> +</html> \ No newline at end of file Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/webapp/controller/KeyMatchController.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/webapp/controller/KeyMatchController.java?rev=411593&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/webapp/controller/KeyMatchController.java (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/java/org/apache/nutch/webapp/controller/KeyMatchController.java Sun Jun 4 12:43:47 2006 @@ -0,0 +1,58 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webapp.controller; + +import java.io.IOException; +import java.util.HashMap; + +import javax.servlet.ServletContext; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.nutch.keymatch.CountFilter; +import org.apache.nutch.keymatch.KeyMatch; +import org.apache.nutch.keymatch.SimpleKeyMatcher; +import org.apache.nutch.webapp.common.ServiceLocator; +import org.apache.nutch.webapp.common.Startable; +import org.apache.struts.tiles.ComponentContext; + +public class KeyMatchController extends NutchController implements Startable{ + + public static final String ATTR_KEYMATCHES="keymatches"; + + static SimpleKeyMatcher keymatcher; + static HashMap context; + + public void nutchPerform(ComponentContext tileContext, + HttpServletRequest request, HttpServletResponse response, + ServletContext servletContext) throws ServletException, IOException { + ServiceLocator serviceLocator=getServiceLocator(request); + HashMap context=new HashMap(); + KeyMatch[] matches=keymatcher.getMatches(serviceLocator.getSearch().getQuery(),context); + request.setAttribute(ATTR_KEYMATCHES, matches); + } + + public void start(ServletContext servletContext) { + LOG.info("Starting keymatcher"); + ServiceLocator serviceLocator=getServiceLocator(servletContext); + keymatcher=new SimpleKeyMatcher(serviceLocator.getConfiguration()); + context=new HashMap(); + //how many matches + context.put(CountFilter.KEY_COUNT,"1"); + LOG.info("Starting keymatcher ok"); + } +} Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/keymatch/TestSimpleKeyMatcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/keymatch/TestSimpleKeyMatcher.java?rev=411593&view=auto ============================================================================== --- 
lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/keymatch/TestSimpleKeyMatcher.java (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/keymatch/TestSimpleKeyMatcher.java Sun Jun 4 12:43:47 2006 @@ -0,0 +1,93 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.keymatch; + +import java.util.HashMap; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.searcher.Query; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestSimpleKeyMatcher extends TestCase { + + + /* (non-Javadoc) + * @see junit.framework.TestCase#setUp() + */ + protected void setUp() throws Exception { + super.setUp(); + conf=NutchConfiguration.create(); + km=new SimpleKeyMatcher(conf); + } + + SimpleKeyMatcher km; + + Configuration conf; + + /* + * Test method for 'org.apache.nutch.keymatch.SimpleKeyMatcher.getMatches(Query, int)' + */ + public void testGetMatches() { + HashMap context=new HashMap(); + context.put(CountFilter.KEY_COUNT,"1"); + + //keyword + KeyMatch[] matches=getKeyMatchesForString("kw1 kw2 kw3 auto"); + assertEquals(1,matches.length); + + //phrase + matches=getKeyMatchesForString("search engine"); + assertEquals(1,matches.length); + + //exact + phrase + matches=getKeyMatchesForString("apache search engine"); + assertEquals(2,matches.length); + + //exact 
+ matches=getKeyMatchesForString("kw1 kw2 kw3 kw4"); + assertEquals(1,matches.length); + + matches=getKeyMatchesForString("kw2 kw2 kw3 kw4"); + assertEquals(0,matches.length); + + } + + /* + * Test method for 'org.apache.nutch.keymatch.SimpleKeyMatcher.addKeyMatch(Map, KeyMatch, boolean)' + */ + public void testAddKeyMatch() { + KeyMatch keymatch=new KeyMatch("httpd","http://www.apache.org/","apache", KeyMatch.TYPE_KEYWORD); + km.addKeyMatch(keymatch); + KeyMatch matched[]=getKeyMatchesForString("httpd"); + assertTrue(keymatch.equals(matched[0])); + } + + private KeyMatch[] getKeyMatchesForString(String string) { + + Query q; + HashMap context=new HashMap(); + context.put(CountFilter.KEY_COUNT,"1"); + try { + q = Query.parse(string, conf); + return km.getMatches(q,context); + } catch (Exception e){ + + } + return new KeyMatch[0]; + } +} Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/keymatch/TestViewCountSorter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/keymatch/TestViewCountSorter.java?rev=411593&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/keymatch/TestViewCountSorter.java (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/test/org/apache/nutch/keymatch/TestViewCountSorter.java Sun Jun 4 12:43:47 2006 @@ -0,0 +1,87 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.keymatch; + +import java.util.HashMap; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.searcher.Query; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestViewCountSorter extends TestCase { + + SimpleKeyMatcher km; + Configuration conf; + + /* (non-Javadoc) + * @see junit.framework.TestCase#setUp() + */ + protected void setUp() throws Exception { + super.setUp(); + conf=NutchConfiguration.create(); + km=new SimpleKeyMatcher(conf); + km.clear(); + KeyMatch m=new KeyMatch("kw1","u1","t1",KeyMatch.TYPE_KEYWORD); + km.addKeyMatch(m); + m=new KeyMatch("kw1","u2","t2",KeyMatch.TYPE_KEYWORD); + km.addKeyMatch(m); + m=new KeyMatch("kw1","u3","t3",KeyMatch.TYPE_KEYWORD); + km.addKeyMatch(m); + ViewCountSorter vcs=new ViewCountSorter(); + vcs.setNext(new CountFilter()); + km.setFilter(vcs); + } + + /* + * Test method for 'org.apache.nutch.keymatch.ViewCountSorter.filter(List, Map)' + */ + public void testFilter() { + KeyMatch m1,m2,m3; + + KeyMatch[] matches=getKeyMatchesForString("kw1"); + m1=matches[0]; + assertNotNull(m1); + + matches=getKeyMatchesForString("kw1"); + m2=matches[0]; + assertNotNull(m2); + + matches=getKeyMatchesForString("kw1"); + m3=matches[0]; + assertNotNull(m3); + + assertFalse(m1.equals(m2)); + assertFalse(m2.equals(m3)); + assertFalse(m1.equals(m3)); + } + + private KeyMatch[] getKeyMatchesForString(String string) { + + Query q; + HashMap context=new HashMap(); + context.put(CountFilter.KEY_COUNT,"1"); + try { + q = 
Query.parse(string, conf); + return km.getMatches(q,context); + } catch (Exception e){ + + } + return new KeyMatch[0]; + } + +} Added: lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/web/web-keymatch/keymatch.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/web/web-keymatch/keymatch.jsp?rev=411593&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/web/web-keymatch/keymatch.jsp (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/src/web/web-keymatch/keymatch.jsp Sun Jun 4 12:43:47 2006 @@ -0,0 +1,11 @@ +<%@ page session="false"%> +<%@ taglib prefix="tiles" uri="http://jakarta.apache.org/struts/tags-tiles"%> +<%@ taglib prefix="c" uri="http://java.sun.com/jstl/core"%> +<%@ taglib prefix="fmt" uri="http://java.sun.com/jstl/fmt"%> +<div id="keymatch"> +<c:forEach var="keymatch" items="${keymatches}"> + <a href="search.do?query=<c:out value="${keymatch.url}"/>"> + <c:out value="${keymatch.title}"/> + </a><br/> +</c:forEach> +</div> _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs