svn commit: r412399 - /lucene/nutch/trunk/conf/mime-types.xml
Author: jerome Date: Wed Jun 7 06:07:27 2006 New Revision: 412399 URL: http://svn.apache.org/viewvc?rev=412399view=rev Log: NUTCH-275 : Remove the magic resolution for xml content-type Modified: lucene/nutch/trunk/conf/mime-types.xml Modified: lucene/nutch/trunk/conf/mime-types.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/mime-types.xml?rev=412399r1=412398r2=412399view=diff == --- lucene/nutch/trunk/conf/mime-types.xml (original) +++ lucene/nutch/trunk/conf/mime-types.xml Wed Jun 7 06:07:27 2006 @@ -693,7 +693,7 @@ mime-type name=text/xml description=Extensible Markup Language File extxml/extextxsl/ext -magic offset=0 value=lt;?xml/ +!--magic offset=0 value=lt;?xml/-- /mime-type mime-type name=text/x-setext
svn commit: r412577 - in /lucene/nutch/trunk/src/test/org/apache/nutch/util/mime: mime-types.txt test.xml
Author: jerome Date: Wed Jun 7 15:06:53 2006 New Revision: 412577 URL: http://svn.apache.org/viewvc?rev=412577view=rev Log: NUTCH-275 : Remove unit test for magic based content type guessing for xml Removed: lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/test.xml Modified: lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt Modified: lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt?rev=412577r1=412576r2=412577view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt Wed Jun 7 15:06:53 2006 @@ -136,7 +136,7 @@ text/tab-separated-values;tsv text/vnd.wap.wml;wml text/vnd.wap.wmlscript;wmls -text/xml;xml;test.xml +text/xml;xml text/xml;xsl text/x-setext;etx video/mpeg;mpg
svn commit: r412582 - /lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java
Author: jerome Date: Wed Jun 7 15:19:08 2006 New Revision: 412582 URL: http://svn.apache.org/viewvc?rev=412582view=rev Log: NUTCH-301 : CommonTerms are cached in the Configuration Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java?rev=412582r1=412581r2=412582view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java Wed Jun 7 15:19:08 2006 @@ -37,7 +37,10 @@ private static final Logger LOG = LogFormatter.getLogger(org.apache.nutch.analysis.CommonGrams); private static final char SEPARATOR = '-'; - private HashMap COMMON_TERMS = new HashMap(); + /** The key used to cache commonTerms in Configuration */ + private static final String KEY = CommonGrams.class.getName(); + + private HashMap commonTerms = new HashMap(); /** * The constructor. @@ -135,7 +138,13 @@ /** Construct using the provided config file. */ private void init(Configuration conf) { +// First, try to retrieve some commonTerms cached in configuration. 
+commonTerms = (HashMap) conf.getObject(KEY); +if (commonTerms != null) { return; } + +// Otherwise, read the terms.file try { + commonTerms = new HashMap(); Reader reader = conf.getConfResourceAsReader (conf.get(analysis.common.terms.file)); BufferedReader in = new BufferedReader(reader); @@ -160,13 +169,14 @@ while ((token = ts.next()) != null) { gram = gram + SEPARATOR + token.termText(); } -HashSet table = (HashSet)COMMON_TERMS.get(field); +HashSet table = (HashSet)commonTerms.get(field); if (table == null) { table = new HashSet(); - COMMON_TERMS.put(field, table); + commonTerms.put(field, table); } table.add(gram); } + conf.setObject(KEY, commonTerms); } catch (IOException e) { throw new RuntimeException(e.toString()); } @@ -175,7 +185,7 @@ /** Construct a token filter that inserts n-grams for common terms. For use * while indexing documents. */ public TokenFilter getFilter(TokenStream ts, String field) { -return new Filter(ts, (HashSet)COMMON_TERMS.get(field)); +return new Filter(ts, (HashSet)commonTerms.get(field)); } /** Utility to convert an array of Query.Terms into a token stream. */
svn commit: r411926 - in /lucene/nutch/trunk/src/plugin/lib-http/src: java/org/apache/nutch/protocol/http/api/RobotRulesParser.java test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
Author: jerome Date: Mon Jun 5 14:43:42 2006 New Revision: 411926 URL: http://svn.apache.org/viewvc?rev=411926view=rev Log: NUTCH-298 : No more NPE if a 404 for a robots.txt + some unit tests Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=411926r1=411925r2=411926view=diff == --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Mon Jun 5 14:43:42 2006 @@ -70,8 +70,8 @@ * file, and can test paths against those rules. 
*/ public static class RobotRuleSet { -ArrayList tmpEntries; -RobotsEntry[] entries; +ArrayList tmpEntries = new ArrayList(); +RobotsEntry[] entries = null; long expireTime; /** Modified: lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=411926r1=411925r2=411926view=diff == --- lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Mon Jun 5 14:43:42 2006 @@ -25,7 +25,29 @@ private static final String CR= \r; private static final String CRLF= \r\n; - + private static final boolean[] ACCEPT_ALL = { +true, // /a, +true, // /a/, +true, // /a/bloh/foo.html +true, // /b, +true, // /b/a, +true, // /b/a/index.html, +true, // /b/b/foo.html, +true, // /c, +true, // /c/a, +true, // /c/a/index.html, +true, // /c/b/foo.html, +true, // /d, +true, // /d/a, +true, // /e/a/index.html, +true, // /e/d, +true, // /e/d/foo.html, +true, // /e/doh.html, +true, // /f/index.html, +true, // /foo/bar.html, +true, // /f/, + }; + private static final String[] ROBOTS_STRINGS= new String[] { User-Agent: Agent1 #foo + CR + Disallow: /a + CR @@ -40,6 +62,7 @@ + + CR + User-Agent: * + CR + Disallow: /foo/bar/ + CR, +null // Used to test EMPTY_RULES }; private static final String[] AGENT_STRINGS= new String[] { @@ -57,7 +80,14 @@ false, false, true, -} +}, +{ + false, + false, + false, + false, + true, +} }; private static final String[] TEST_PATHS= new String[] { @@ -195,6 +225,13 @@ false, // /foo/bar.html, true, // /f/, } +}, +{ // ROBOTS_STRINGS[1] + ACCEPT_ALL, // Agent 1 + ACCEPT_ALL, // Agent 2 + ACCEPT_ALL, // Agent 3 + ACCEPT_ALL, // Agent 4 + ACCEPT_ALL, // Agent 5 } }; @@ -233,7 +270,9 @@ for (int i= 1; i 
agents.length; i++) agentsString= agentsString + , + agents[i]; RobotRulesParser p= new RobotRulesParser(agents); -RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString].getBytes()); +RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null + ? ROBOTS_STRINGS[robotsString].getBytes() + : null); for (int i= 0; i paths.length; i++) { assertTrue(testing robots file +robotsString+, on agents ( + agentsString + ), and path + TEST_PATHS[i] + ; got @@ -243,4 +282,6 @@ } } + + }
svn commit: r411935 - /lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
Author: jerome Date: Mon Jun 5 15:02:35 2006 New Revision: 411935 URL: http://svn.apache.org/viewvc?rev=411935&view=rev Log: NutchAnalyzer is now Configurable (Teruhiko Kurosaka) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java?rev=411935&r1=411934&r2=411935&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java Mon Jun 5 15:02:35 2006 @@ -22,6 +22,10 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; +// Hadoop imports +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; + // Nutch imports import org.apache.nutch.plugin.Pluggable; @@ -34,16 +38,37 @@ * @author J&eacute;r&ocirc;me Charron */ public abstract class NutchAnalyzer extends Analyzer -implements Pluggable { +implements Configurable, Pluggable { /** The name of the extension point. */ final static String X_POINT_ID = NutchAnalyzer.class.getName(); + /** The current Configuration */ + protected Configuration conf = null; + /** * Creates a TokenStream which tokenizes all the text in the provided Reader. */ public abstract TokenStream tokenStream(String fieldName, Reader reader); - + + /* - * + * implementation:Configurable * + * - */ + + // Inherited Javadoc + public void setConf(Configuration conf) { +this.conf = conf; + } + + // Inherited Javadoc + public Configuration getConf() { +return this.conf; + } + + /* -- * + * /implementation:Configurable * + * -- */ + }
svn commit: r406053 - in /lucene/nutch/trunk/src: java/org/apache/nutch/clustering/ java/org/apache/nutch/parse/ java/org/apache/nutch/scoring/ plugin/protocol-httpclient/src/java/org/apache/nutch/pro
Author: jerome Date: Sat May 13 02:14:53 2006 New Revision: 406053 URL: http://svn.apache.org/viewcvs?rev=406053&view=rev Log: Fix Javadoc Warnings Modified: lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java?rev=406053&r1=406052&r2=406053&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java Sat May 13 02:14:53 2006 @@ -60,7 +60,7 @@ /** * @return Returns the online clustering extension specified - * in nutch configuration (key name in this field: {@link #CONFIG_FIELD_NAME}). + * in nutch configuration (key name is <code>extension.clustering.extension-name</code>). * If the name is empty (no preference), the first available clustering extension is * returned. 
*/ Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=406053&r1=406052&r2=406053&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java Sat May 13 02:14:53 2006 @@ -46,7 +46,7 @@ * @author mattmann * @version 1.0 */ -public class ParsePluginsReader { +class ParsePluginsReader { /* our log stream */ public static final Logger LOG = @@ -66,9 +66,9 @@ /** * Reads the <code>parse-plugins.xml</code> file and returns the - * {@link ParsePluginList} defined by it. + * {@link #ParsePluginList} defined by it. * - * @return A {@link ParsePluginList} specified by the + * @return A {@link #ParsePluginList} specified by the * <code>parse-plugins.xml</code> file. * @throws Exception * If any parsing error occurs. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=406053&r1=406052&r2=406053&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Sat May 13 02:14:53 2006 @@ -74,7 +74,7 @@ /** * Function returns an array of {@link Parser}s for a given content type. * - * The function consults the internal {@link ParsePluginList} for the + * The function consults the internal list of parse plugins for the * ParserFactory to determine the list of pluginIds, then gets the * appropriate extension points to instantiate as {@link Parser}s. * @@ -149,12 +149,12 @@ * instantiate the Parser itself , then this function will cache that Parser * in the internal <code>PARSER_CACHE</code>. 
* - * @param extId The string extension ID (e.g., + * @param id The string extension ID (e.g., *org.apache.nutch.parse.rss.RSSParser, *org.apache.nutch.parse.rtf.RTFParseFactory) of the {@link Parser} *implementation to return. * @return A {@link Parser} implementation specified by the parameter - * <code>extId</code>. + * <code>id</code>. * @throws ParserNotFound If the Parser is not found (i.e., registered with * the extension point), or if the there a * {@link PluginRuntimeException} instantiating the {@link Parser}. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java?rev=406053&r1=406052&r2=406053&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch
svn commit: r405566 - /lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
Author: jerome Date: Tue May 9 16:06:17 2006 New Revision: 405566 URL: http://svn.apache.org/viewcvs?rev=405566view=rev Log: NUTCH-134 - No more needs for the clusterer to remove html tags from summaries Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java?rev=405566r1=405565r2=405566view=diff == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java Tue May 9 16:06:17 2006 @@ -34,7 +34,6 @@ import com.dawidweiss.carrot.core.local.ProcessingException; import com.dawidweiss.carrot.core.local.RequestContext; import com.dawidweiss.carrot.core.local.clustering.*; -import com.dawidweiss.carrot.util.common.StringUtils; /** * A local input component that ignores the query passed from the @@ -103,7 +102,7 @@ // produce 'documents' for successor components. final RawDocumentsConsumer consumer = (RawDocumentsConsumer) next; for (int i=0;isummaries.length;i++) { - consumer.addDocument(new NutchDocument(i, details[i], htmlToText(summaries[i]), defaultLanguage)); + consumer.addDocument(new NutchDocument(i, details[i], summaries[i], defaultLanguage)); } } @@ -121,14 +120,4 @@ return SUCCESSOR_CAPABILITIES; } - /** - * Converts a html chunk to plain text. - * - * This method is only required because Nutch's summaries are in HTML. - * I guess it would be possible to get rid of the code below by - * adding patches/ methods to Nutch that return plain text summaries. 
- */ - private final String htmlToText(String html) { -return StringUtils.removeMarkup(html); - } }
svn commit: r405165 - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/searcher/ src/plugin/ src/plugin/nutch-extensionpoints/ src/plugin/summary-basic/ src/plugin/summary-basic/src/ src/plu
Author: jerome Date: Mon May 8 14:04:01 2006 New Revision: 405165 URL: http://svn.apache.org/viewcvs?rev=405165view=rev Log: NUTCH-134 : Added a summarizer extension point and two enxtensions: * summary-basic is the current nutch implementation moved into a plugin * summary-lucene a raw version of a summarizer plugin based on lucene highlighter Added: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java (with props) lucene/nutch/trunk/src/plugin/summary-basic/ lucene/nutch/trunk/src/plugin/summary-basic/build.xml (with props) lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml (with props) lucene/nutch/trunk/src/plugin/summary-basic/src/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java (with props) lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html (with props) lucene/nutch/trunk/src/plugin/summary-lucene/ lucene/nutch/trunk/src/plugin/summary-lucene/build.xml (with props) lucene/nutch/trunk/src/plugin/summary-lucene/lib/ lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar (with props) lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml (with props) lucene/nutch/trunk/src/plugin/summary-lucene/src/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/ 
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java (with props) lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html (with props) Modified: lucene/nutch/trunk/build.xml lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/default.properties lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=405165r1=405164r2=405165view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Mon May 8 14:04:01 2006 @@ -323,6 +323,8 @@ packageset dir=${plugins.dir}/query-more/src/java/ packageset dir=${plugins.dir}/query-site/src/java/ packageset dir=${plugins.dir}/query-url/src/java/ + packageset dir=${plugins.dir}/summary-basic/src/java/ + packageset dir=${plugins.dir}/summary-lucene/src/java/ packageset dir=${plugins.dir}/urlfilter-automaton/src/java/ packageset dir=${plugins.dir}/urlfilter-regex/src/java/ packageset dir=${plugins.dir}/urlfilter-prefix/src/java/ @@ -350,6 +352,7 @@ group title=Analysis Plugins packages=${plugins.analysis}/ group title=Indexing Filter Plugins packages=${plugins.index}/ group title=Query Filter Plugins packages=${plugins.query}/ + group title=Summary Plugins packages=${plugins.summary}/ group title=Clustering Plugins packages=${plugins.clustering}/ group title=Ontology Plugins packages=${plugins.ontology}/ group title=Misc. 
Plugins packages=${plugins.misc}/ Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=405165r1=405164r2=405165view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon May 8 14:04:01 2006 @@ -564,7 +564,7 @@ property nameplugin.includes/name - valueprotocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value + valueprotocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic/value descriptionRegular expression naming plugin directory names to include. Any plugin not matching this expression is excluded. In any
svn commit: r394228 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/plugin/ src/plugin/ src/plugin/analysis-de/ src/plugin/analysis-fr/ src/plugin/clustering-carrot2/ src/plugin/creativecommons
Author: jerome Date: Fri Apr 14 16:57:24 2006 New Revision: 394228 URL: http://svn.apache.org/viewcvs?rev=394228view=rev Log: NUTCH-245 : Added a DTD for Nutch Plugin Manifest - Add a commented DTD in src - Add the DTD in javadoc - Change the implementation element structure : uses name-value parameters instead of proprietary attributes - Fix unit tests regarding changes in DTD - Fix the plugin.xml file in nutch plugins regarding changes in DTD Added: lucene/nutch/trunk/src/plugin/plugin.dtd (with props) Modified: lucene/nutch/trunk/build.xml lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java lucene/nutch/trunk/src/java/org/apache/nutch/plugin/package.html lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml lucene/nutch/trunk/src/plugin/analysis-fr/plugin.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml lucene/nutch/trunk/src/plugin/index-basic/plugin.xml lucene/nutch/trunk/src/plugin/index-more/plugin.xml lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml lucene/nutch/trunk/src/plugin/lib-commons-httpclient/plugin.xml lucene/nutch/trunk/src/plugin/lib-http/plugin.xml lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml lucene/nutch/trunk/src/plugin/lib-log4j/plugin.xml lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml lucene/nutch/trunk/src/plugin/lib-xml/plugin.xml lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml lucene/nutch/trunk/src/plugin/ontology/plugin.xml lucene/nutch/trunk/src/plugin/parse-ext/plugin.xml lucene/nutch/trunk/src/plugin/parse-html/plugin.xml lucene/nutch/trunk/src/plugin/parse-js/plugin.xml lucene/nutch/trunk/src/plugin/parse-mp3/plugin.xml 
lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml lucene/nutch/trunk/src/plugin/parse-rtf/plugin.xml lucene/nutch/trunk/src/plugin/parse-swf/plugin.xml lucene/nutch/trunk/src/plugin/parse-text/plugin.xml lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml lucene/nutch/trunk/src/plugin/protocol-file/plugin.xml lucene/nutch/trunk/src/plugin/protocol-ftp/plugin.xml lucene/nutch/trunk/src/plugin/protocol-http/plugin.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml lucene/nutch/trunk/src/plugin/query-basic/plugin.xml lucene/nutch/trunk/src/plugin/query-more/plugin.xml lucene/nutch/trunk/src/plugin/query-site/plugin.xml lucene/nutch/trunk/src/plugin/query-url/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml lucene/nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=394228r1=394227r2=394228view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Fri Apr 14 16:57:24 2006 @@ -279,6 +279,9 @@ !-- Documentation -- !-- == -- target name=javadoc depends=compile +!-- Copy the plugin.dtd file to the plugin doc-files dir -- +copy file=${plugins.dir}/plugin.dtd + todir=${src.dir}/org/apache/nutch/plugin/doc-files/ mkdir dir=${build.javadoc}/ javadoc overview=${src.dir}/overview.html @@ -353,6 +356,7 @@ group title=Ontology Plugins packages=${plugins.ontology}/ group title=Misc. 
Plugins packages=${plugins.misc}/ /javadoc + /target target name=default-doc Modified: lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=394228r1=394227r2=394228view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java Fri Apr 14 16:57:24 2006 @@ -36,6 +36,8 @@ import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; +import org.xml.sax.EntityResolver
svn commit: r394231 - in /lucene/nutch/trunk: build.xml src/plugin/plugin.dtd
Author: jerome Date: Fri Apr 14 17:13:21 2006 New Revision: 394231 URL: http://svn.apache.org/viewcvs?rev=394231view=rev Log: NUTCH-245 : Some minor fixes - Added Apache License in DTD (?) - Delete the org/apache/nutch/plugin/doc-files once javadoc task completed. Modified: lucene/nutch/trunk/build.xml lucene/nutch/trunk/src/plugin/plugin.dtd Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=394231r1=394230r2=394231view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Fri Apr 14 17:13:21 2006 @@ -356,7 +356,8 @@ group title=Ontology Plugins packages=${plugins.ontology}/ group title=Misc. Plugins packages=${plugins.misc}/ /javadoc - +!-- Clean the doc-files dir from src -- +delete dir=${src.dir}/org/apache/nutch/plugin/doc-files/ /target target name=default-doc Modified: lucene/nutch/trunk/src/plugin/plugin.dtd URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/plugin.dtd?rev=394231r1=394230r2=394231view=diff == --- lucene/nutch/trunk/src/plugin/plugin.dtd (original) +++ lucene/nutch/trunk/src/plugin/plugin.dtd Fri Apr 14 17:13:21 2006 @@ -1,10 +1,25 @@ ?xml version=1.0 encoding=UTF-8? !-- + ! Copyright 2005 The Apache Software Foundation + ! + ! Licensed under the Apache License, Version 2.0 (the License); + ! you may not use this file except in compliance with the License. + ! You may obtain a copy of the License at + ! + ! http://www.apache.org/licenses/LICENSE-2.0 + ! + ! Unless required by applicable law or agreed to in writing, software + ! distributed under the License is distributed on an AS IS BASIS, + ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ! See the License for the specific language governing permissions and + ! limitations under the License. + ! + ! ! Document : plugin.dtd ! Created on : 14 avril 2006, 22:14 - ! Author : Jerome Charron - ! Description: Nutch plug-in manifest + ! 
Author : Chris Mattmann, Jerome Charron + ! Description: Nutch plug-in manifest DTD ! ! PUBLIC ID : -//Apache Software Fundation//DTD Nutch Plugin Manifest 1.0//EN ! SYSTEM ID : http://lucene.apache.org/nutch/plugin.dtd
svn commit: r391958 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/parse/ParseData.java src/test/org/apache/nutch/parse/TestParseData.java src/test/org/apache/nutch/util/Wr
Author: jerome Date: Thu Apr 6 03:49:40 2006 New Revision: 391958 URL: http://svn.apache.org/viewcvs?rev=391958view=rev Log: NUTCH-244, db.max.outlinks.per.page can now be negative for no limit of handled outlinks per page Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=391958r1=391957r2=391958view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Apr 6 03:49:40 2006 @@ -255,6 +255,8 @@ namedb.max.outlinks.per.page/name value100/value descriptionThe maximum number of outlinks that we'll process for a page. + If this value is nonnegative (=0), at most db.max.outlinks.per.page outlinks + will be processed for a page; otherwise, all outlinks will be processed. 
/description /property Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=391958r1=391957r2=391958view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Thu Apr 6 03:49:40 2006 @@ -119,12 +119,15 @@ int totalOutlinks = in.readInt(); // read outlinks int maxOutlinksPerPage = this.conf.getInt(db.max.outlinks.per.page, 100); -int outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks); +int outlinksToRead = totalOutlinks; +if (maxOutlinksPerPage = 0) { + outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks); +} outlinks = new Outlink[outlinksToRead]; for (int i = 0; i outlinksToRead; i++) { outlinks[i] = Outlink.read(in); } -for (int i = maxOutlinksPerPage; i totalOutlinks; i++) { +for (int i = outlinksToRead; i totalOutlinks; i++) { Outlink.skip(in); } Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?rev=391958r1=391957r2=391958view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Thu Apr 6 03:49:40 2006 @@ -51,4 +51,31 @@ WritableTestUtils.testWritable(r, conf); } + public void testMaxOutlinks() throws Exception { +Outlink[] outlinks = new Outlink[128]; +for (int i=0; ioutlinks.length; i++) { + outlinks[i] = new Outlink(http://outlink.com/; + i, Outlink + i, conf); +} +ParseData original = new ParseData(ParseStatus.STATUS_SUCCESS, + Max Outlinks Title, + outlinks, + new Metadata()); +Configuration conf = NutchConfiguration.create(); +// No Outlinks +conf.setInt(db.max.outlinks.per.page, 0); +ParseData data = (ParseData) WritableTestUtils.writeRead(original, conf); 
+assertEquals(0, data.getOutlinks().length); +// Only 100 Outlinks +conf.setInt(db.max.outlinks.per.page, 100); +data = (ParseData) WritableTestUtils.writeRead(original, conf); +assertEquals(100, data.getOutlinks().length); +// 256 Outlinks +conf.setInt(db.max.outlinks.per.page, 256); +data = (ParseData) WritableTestUtils.writeRead(original, conf); +assertEquals(outlinks.length, data.getOutlinks().length); +// All Outlinks +conf.setInt(db.max.outlinks.per.page, -1); +data = (ParseData) WritableTestUtils.writeRead(original, conf); +assertEquals(outlinks.length, data.getOutlinks().length); + } } Modified: lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java?rev=391958r1=391957r2=391958view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java Thu Apr 6 03:49:40 2006 @@ -31,6 +31,14 @@ /** Utility method for testing writables. */ public static void testWritable(Writable before, Configuration conf) throws Exception { +TestCase.assertEquals(before, writeRead(before, conf
svn commit: r391150 - /lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
Author: jerome Date: Mon Apr 3 13:57:46 2006 New Revision: 391150 URL: http://svn.apache.org/viewcvs?rev=391150&view=rev Log: No longer dump the parse-mspowerpoint unit test result to a file for visual checks Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=391150&r1=391149&r2=391150&view=diff == --- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Mon Apr 3 13:57:46 2006 @@ -70,7 +70,7 @@ /** * Wether dumping the extracted data to file for visual checks. */ - private final static boolean DUMP_TO_FILE = true; + private final static boolean DUMP_TO_FILE = false; private final File testFile;
svn commit: r390392 - in /lucene/nutch/trunk/src/java/org/apache/nutch: analysis/ clustering/ indexer/ net/ ontology/ parse/ plugin/ protocol/ searcher/
Author: jerome Date: Fri Mar 31 03:04:43 2006 New Revision: 390392 URL: http://svn.apache.org/viewcvs?rev=390392view=rev Log: Add a common Pluggable interface to all Extension Points and a package description for plugin Added: lucene/nutch/trunk/src/java/org/apache/nutch/plugin/Pluggable.java (with props) lucene/nutch/trunk/src/java/org/apache/nutch/plugin/package.html (with props) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java lucene/nutch/trunk/src/java/org/apache/nutch/ontology/Ontology.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java?rev=390392r1=390391r2=390392view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java Fri Mar 31 03:04:43 2006 @@ -22,6 +22,9 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; +// Nutch imports +import org.apache.nutch.plugin.Pluggable; + /** * Extension point for analysis. @@ -30,7 +33,8 @@ * * @author Jeacute;rocirc;me Charron */ -public abstract class NutchAnalyzer extends Analyzer { +public abstract class NutchAnalyzer extends Analyzer +implements Pluggable { /** The name of the extension point. 
*/ final static String X_POINT_ID = NutchAnalyzer.class.getName(); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java?rev=390392r1=390391r2=390392view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java Fri Mar 31 03:04:43 2006 @@ -16,8 +16,11 @@ package org.apache.nutch.clustering; +// Nutch imports +import org.apache.nutch.plugin.Pluggable; import org.apache.nutch.searcher.HitDetails; + /** * An extension point interface for online search results clustering * algorithms. @@ -33,7 +36,7 @@ * @author Dawid Weiss * @version $Id: OnlineClusterer.java,v 1.1 2004/08/09 23:23:52 johnnx Exp $ */ -public interface OnlineClusterer { +public interface OnlineClusterer extends Pluggable { /** The name of the extension point. */ public final static String X_POINT_ID = OnlineClusterer.class.getName(); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=390392r1=390391r2=390392view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Fri Mar 31 03:04:43 2006 @@ -16,18 +16,25 @@ package org.apache.nutch.indexer; +// Lucene imports import org.apache.lucene.document.Document; -import org.apache.nutch.parse.Parse; + +// Hadoop imports import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.io.UTF8; + +// Nutch imports +import org.apache.nutch.parse.Parse; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; -import org.apache.hadoop.io.UTF8; +import org.apache.nutch.plugin.Pluggable; + /** Extension point for 
indexing. Permits one to add metadata to the indexed * fields. All plugins found which implement this extension point are run * sequentially on the parse. */ -public interface IndexingFilter extends Configurable { +public interface IndexingFilter extends Pluggable, Configurable { /** The name of the extension point. */ final static String X_POINT_ID = IndexingFilter.class.getName(); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java?rev=390392r1=390391r2=390392view=diff == --- lucene/nutch/trunk/src
svn commit: r389729 - in /lucene/nutch/trunk/src/plugin/parse-rss: build.xml lib/jaxen-core.jar lib/jaxen-jdom.jar lib/jdom.jar lib/saxpath.jar lib/xercesImpl.jar lib/xml-apis.jar plugin.xml
Author: jerome Date: Wed Mar 29 01:45:01 2006 New Revision: 389729 URL: http://svn.apache.org/viewcvs?rev=389729view=rev Log: parse-rss now depends on new lib-xml plugin Removed: lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-core.jar lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-jdom.jar lucene/nutch/trunk/src/plugin/parse-rss/lib/jdom.jar lucene/nutch/trunk/src/plugin/parse-rss/lib/saxpath.jar lucene/nutch/trunk/src/plugin/parse-rss/lib/xercesImpl.jar lucene/nutch/trunk/src/plugin/parse-rss/lib/xml-apis.jar Modified: lucene/nutch/trunk/src/plugin/parse-rss/build.xml lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Modified: lucene/nutch/trunk/src/plugin/parse-rss/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/build.xml?rev=389729r1=389728r2=389729view=diff == --- lucene/nutch/trunk/src/plugin/parse-rss/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-rss/build.xml Wed Mar 29 01:45:01 2006 @@ -6,18 +6,21 @@ !-- Build compilation dependencies -- target name=deps-jar + ant target=jar inheritall=false dir=../lib-xml/ ant target=jar inheritall=false dir=../lib-log4j/ /target !-- Add compilation dependencies to classpath -- path id=plugin.deps fileset dir=${nutch.root}/build + include name=**/lib-xml/*.jar / include name=**/lib-log4j/*.jar / /fileset /path !-- Deploy Unit test dependencies -- target name=deps-test + ant target=deploy inheritall=false dir=../lib-xml/ ant target=deploy inheritall=false dir=../lib-log4j/ ant target=deploy inheritall=false dir=../nutch-extensionpoints/ ant target=deploy inheritall=false dir=../lib-commons-httpclient/ Modified: lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml?rev=389729r1=389728r2=389729view=diff == --- lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Wed Mar 29 01:45:01 2006 @@ -10,19 +10,13 @@ library 
name=parse-rss.jar export name=*/ /library - library name=jdom.jar/ - library name=jaxen-core.jar/ - library name=jaxen-jdom.jar/ library name=commons-feedparser-0.6-fork.jar/ - library name=saxpath.jar/ - library name=log4j-1.2.6.jar/ - library name=xercesImpl.jar/ - library name=xml-apis.jar/ library name=xml-rpc-1.2.jar/ /runtime requires import plugin=nutch-extensionpoints/ + import plugin=lib-xml/ import plugin=lib-log4j/ import plugin=lib-commons-httpclient/ /requires
svn commit: r389456 - /lucene/nutch/trunk/build.xml
Author: jerome Date: Tue Mar 28 01:40:29 2006 New Revision: 389456 URL: http://svn.apache.org/viewcvs?rev=389456view=rev Log: NUTCH-210, forgot to commit the nutch.xml xsl generation Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=389456r1=389455r2=389456view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Tue Mar 28 01:40:29 2006 @@ -144,6 +144,14 @@ !---- !-- == -- target name=war depends=jar,compile,generate-docs + +!-- generate the nutch.xml (servlet context) file -- +xslt in=${basedir}/conf/nutch-default.xml + out=${build.dir}/nutch.xml + style=${basedir}/conf/context.xsl +xmlcatalog refid=docDTDs/ + outputproperty name=indent value=yes/ +/xslt war destfile=${build.dir}/${final.name}.war webxml=${web.src.dir}/web.xml fileset dir=${web.src.dir}/jsp/
svn commit: r389514 - /lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
Author: jerome Date: Tue Mar 28 06:56:21 2006 New Revision: 389514 URL: http://svn.apache.org/viewcvs?rev=389514view=rev Log: NUTCH-210, make use of nutch.xml servlet context in opensearch servlet too Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=389514r1=389513r2=389514view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Tue Mar 28 06:56:21 2006 @@ -62,7 +62,7 @@ public void init(ServletConfig config) throws ServletException { try { - this.conf = NutchConfiguration.create(); + this.conf = NutchConfiguration.get(config.getServletContext()); bean = NutchBean.get(config.getServletContext(), this.conf); } catch (IOException e) { throw new ServletException(e);
svn commit: r389160 - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/util/ src/web/jsp/
Author: jerome Date: Mon Mar 27 06:52:14 2006 New Revision: 389160 URL: http://svn.apache.org/viewcvs?rev=389160view=rev Log: NUTCH-210, Add an xsl that generates a basic ServletContext XML file for the nutch webapp and make use of the ServletContext init parameters to override the properties in nutch-default and nutch-site properties. Contributed by Chris Mattmann. Added: lucene/nutch/trunk/conf/context.xsl (with props) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java lucene/nutch/trunk/src/web/jsp/anchors.jsp lucene/nutch/trunk/src/web/jsp/cached.jsp lucene/nutch/trunk/src/web/jsp/explain.jsp lucene/nutch/trunk/src/web/jsp/refine-query-init.jsp lucene/nutch/trunk/src/web/jsp/search.jsp lucene/nutch/trunk/src/web/jsp/text.jsp Added: lucene/nutch/trunk/conf/context.xsl URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/context.xsl?rev=389160view=auto == --- lucene/nutch/trunk/conf/context.xsl (added) +++ lucene/nutch/trunk/conf/context.xsl Mon Mar 27 06:52:14 2006 @@ -0,0 +1,87 @@ +?xml version=1.0 encoding=ISO-8859-1? +!-- + Copyright 2006 The Apache Software Foundation + + Licensed under the Apache License, Version 2.0 (the License); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an AS IS BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Author : Chris Mattmann +Auhtor : Jérôme Charron + Description: This xsl file is used to transform dynamic properties out of the + nutch-default.xml file into a deployable Context.xml file for configuring + the Nutch war file in a servlet container. 
+-- + +xsl:stylesheet version=1.0 xmlns:xsl=http://www.w3.org/1999/XSL/Transform; + +xsl:template match=/ +Context path=/nutch docBase=nutch-0.8-dev.war +debug=5 reloadable=true crossContext=true + +xsl:for-each select=configuration/property + +!-- searcher. properties -- +xsl:call-template name=parameter + xsl:with-param name=property select=current()/ + xsl:with-param name=filter select='searcher.'/ +/xsl:call-template + +!-- plugin. properties -- +xsl:call-template name=parameter + xsl:with-param name=property select=current()/ + xsl:with-param name=filter select='plugin.'/ +/xsl:call-template + +!-- extension.clustering. properties -- +xsl:call-template name=parameter + xsl:with-param name=property select=current()/ + xsl:with-param name=filter select='extension.clustering.'/ +/xsl:call-template + +!-- extension.ontology. properties -- +xsl:call-template name=parameter + xsl:with-param name=property select=current()/ + xsl:with-param name=filter select='extension.ontology.'/ +/xsl:call-template + +!-- query. properties -- +xsl:call-template name=parameter + xsl:with-param name=property select=current()/ + xsl:with-param name=filter select='query.'/ +/xsl:call-template + +/xsl:for-each + +/Context +/xsl:template + + +!-- + ! Template used to write out a parameter if the property's + ! name contains the specified filter string. 
+ !-- +xsl:template name=parameter + xsl:param name=property/ + xsl:param name=filter/ + xsl:if test=contains(name, $filter) + Parameter override=false +xsl:attribute name=name + xsl:value-of select=name/ +/xsl:attribute +xsl:attribute name=value + xsl:value-of select=value/ +/xsl:attribute + /Parameter + /xsl:if +/xsl:template + +/xsl:stylesheet Propchange: lucene/nutch/trunk/conf/context.xsl -- svn:eol-style = native Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=389160r1=389159r2=389160view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Mon Mar 27 06:52:14 2006 @@ -16,12 +16,23 @@ package org.apache.nutch.util; +// JDK imports +import java.util.Enumeration; + +// Servlet imports +import javax.servlet.ServletContext; + +// Hadoop imports import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.WritableName; + /** Utility to create Hadoop [EMAIL PROTECTED] Configuration}s that include Nutch-specific * resources. */ public class
svn commit: r388293 - /lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
Author: jerome Date: Thu Mar 23 15:21:03 2006 New Revision: 388293 URL: http://svn.apache.org/viewcvs?rev=388293view=rev Log: Set the configuration of the parser used in the main method to fix NPEs Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=388293r1=388292r2=388293view=diff == --- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Thu Mar 23 15:21:03 2006 @@ -269,9 +269,11 @@ byte[] bytes = new byte[(int)file.length()]; DataInputStream in = new DataInputStream(new FileInputStream(file)); in.readFully(bytes); -Parse parse = new HtmlParser().getParse( -new Content(url, url, bytes, text/html, new Metadata(), -NutchConfiguration.create())); +Configuration conf = NutchConfiguration.create(); +HtmlParser parser = new HtmlParser(); +parser.setConf(conf); +Parse parse = parser.getParse( +new Content(url, url, bytes, text/html, new Metadata(), conf)); System.out.println(data: +parse.getData()); System.out.println(text: +parse.getText());
svn commit: r387650 - in /lucene/nutch/trunk/src/plugin/urlfilter-regex: ./ sample/ src/java/org/apache/nutch/net/ src/test/ src/test/org/ src/test/org/apache/ src/test/org/apache/nutch/ src/test/org/
Author: jerome Date: Tue Mar 21 14:26:56 2006 New Revision: 387650 URL: http://svn.apache.org/viewcvs?rev=387650view=rev Log: urlfilter-regex now use the lib-regex-filter framework. Add some unit tests. Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java (with props) Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml?rev=387650r1=387649r2=387650view=diff == --- lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml Tue Mar 21 14:26:56 2006 @@ -4,4 +4,29 @@ import file=../build-plugin.xml/ + !-- Build compilation dependencies -- + target name=deps-jar +ant target=jar inheritall=false dir=../lib-regex-filter/ +ant target=compile-test inheritall=false dir=../lib-regex-filter/ + /target + + !-- Add compilation dependencies to classpath -- + path id=plugin.deps +fileset 
dir=${nutch.root}/build + include name=**/lib-regex-filter/*.jar / +/fileset +pathelement location=${nutch.root}/build/lib-regex-filter/test/ + /path + + !-- Deploy Unit test dependencies -- + target name=deps-test +ant target=deploy inheritall=false dir=../lib-regex-filter/ + /target + + !-- for junit test -- + mkdir dir=${build.test}/data/ + copy todir=${build.test}/data +fileset dir=sample includes=**/*.rules, **/*.urls/ + /copy + /project Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules?rev=387650view=auto == --- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules Tue Mar 21 14:26:56 2006 @@ -0,0 +1,26 @@ +# The url filter file used by the crawl command. + +# Better for intranet crawling. +# Be sure to change MY.DOMAIN.NAME to your domain name. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, mailto: urls +-^(file|ftp|mailto): + +# skip image and other suffixes we can't yet parse +-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ + +# skip URLs containing certain characters as probable queries, etc. [EMAIL PROTECTED] + +# skip .fr .org and .net domains +-^.*//.*\.fr/ +-^.*//.*\.org/ +-^.*//.*\.net/ + +# skip everything else ++. 
Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls?rev=387650view=auto == --- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls Tue Mar 21 14:26:56 2006 @@ -0,0 +1,297 @@ ++http://www.hostip.info/ +-http://www.elanceur.org/Articles/OntologieSurfaite.html ++http://www.opensymphony.com/quartz/ +-http://www.portletbridge.org/saxbenchmark/index.html ++http://www.lesmotsdelinfo.com/ ++http://usefulinc.com/doap/ ++http://www.codezoo.com/ ++http://search.infocious.com/ +-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html ++http://www.brics.dk/%7Eamoeller/automaton/ ++http://jazzz.com/wp.html ++http://www.maxkiesler.com/index.php ++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html ++http://www.alias-i.com/lingpipe/ +-http
svn commit: r387651 - in /lucene/nutch/trunk/src/plugin/urlfilter-automaton: ./ lib/ sample/ src/ src/java/ src/java/org/ src/java/org/apache/ src/java/org/apache/nutch/ src/java/org/apache/nutch/net/
Author: jerome Date: Tue Mar 21 14:29:18 2006 New Revision: 387651 URL: http://svn.apache.org/viewcvs?rev=387651view=rev Log: Add an urlfilter based on dk.brics.automaton. Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml (with props) lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar (with props) lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml (with props) lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.rules lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.urls lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/AutomatonURLFilter.java (with props) lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/package.html (with props) lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/ 
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/TestAutomatonURLFilter.java (with props) Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml?rev=387651view=auto == --- lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml Tue Mar 21 14:29:18 2006 @@ -0,0 +1,32 @@ +?xml version=1.0? + +project name=urlfilter-automaton default=jar-core + + import file=../build-plugin.xml/ + + !-- Build compilation dependencies -- + target name=deps-jar +ant target=jar inheritall=false dir=../lib-regex-filter/ +ant target=compile-test inheritall=false dir=../lib-regex-filter/ + /target + + !-- Add compilation dependencies to classpath -- + path id=plugin.deps +fileset dir=${nutch.root}/build + include name=**/lib-regex-filter/*.jar / +/fileset +pathelement location=${nutch.root}/build/lib-regex-filter/test/ + /path + + !-- Deploy Unit test dependencies -- + target name=deps-test +ant target=deploy inheritall=false dir=../lib-regex-filter/ + /target + + !-- for junit test -- + mkdir dir=${build.test}/data/ + copy todir=${build.test}/data +fileset dir=sample includes=**/*.rules, **/*.urls/ + /copy + +/project Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml -- svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar?rev=387651view=auto == Binary file - no diff available. 
Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml?rev=387651view=auto == --- lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml Tue Mar 21 14:29:18 2006 @@ -0,0 +1,25 @@ +?xml version=1.0 encoding=UTF-8? +plugin + id=urlfilter-automaton + name=Automaton URL Filter + version=1.0.0 + provider-name=nutch.org + + runtime + library name=urlfilter-automaton.jar + export name=*/ + /library
svn commit: r387659 - in /lucene/nutch/trunk/src/plugin: urlfilter-automaton/plugin.xml urlfilter-regex/plugin.xml
Author: jerome Date: Tue Mar 21 14:49:54 2006 New Revision: 387659 URL: http://svn.apache.org/viewcvs?rev=387659view=rev Log: Add some missing runtime dependencies Modified: lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml Modified: lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml?rev=387659r1=387658r2=387659view=diff == --- lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml Tue Mar 21 14:49:54 2006 @@ -9,10 +9,12 @@ library name=urlfilter-automaton.jar export name=*/ /library + library name=automaton.jar/ /runtime requires import plugin=nutch-extensionpoints/ + import plugin=lib-regex-filter/ /requires extension id=org.apache.nutch.net.urlfilter Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml?rev=387659r1=387658r2=387659view=diff == --- lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml Tue Mar 21 14:49:54 2006 @@ -13,6 +13,7 @@ requires import plugin=nutch-extensionpoints/ + import plugin=lib-regex-filter/ /requires extension id=org.apache.nutch.net.urlfilter
svn commit: r385702 - /lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
Author: jerome Date: Mon Mar 13 16:00:38 2006 New Revision: 385702 URL: http://svn.apache.org/viewcvs?rev=385702&view=rev Log: Fix NPE if lang is null Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=385702&r1=385701&r2=385702&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java Mon Mar 13 16:00:38 2006 @@ -78,6 +78,7 @@ private Extension getExtension(String lang) { +if (lang == null) { return null; } Extension extension = (Extension) this.conf.getObject(lang); if (extension == null) { extension = findExtension(lang);
svn commit: r385267 - /lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
Author: jerome Date: Sun Mar 12 01:41:13 2006 New Revision: 385267 URL: http://svn.apache.org/viewcvs?rev=385267view=rev Log: NUTCH-228, clustering plugin descriptor fixed (Dawid Weiss) Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=385267r1=385266r2=385267view=diff == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Sun Mar 12 01:41:13 2006 @@ -18,7 +18,7 @@ library name=carrot2-util-tokenizer.jar/ library name=colt-1.0.3.jar/ - library name=commons-collections-3.0.jar/ + library name=commons-collections-3.1-patched.jar/ library name=commons-pool-1.1.jar/ library name=FSA.jar/ library name=Jama-1.0.1-patched.jar/
svn commit: r385268 - /lucene/nutch/trunk/src/web/jsp/cluster.jsp
Author: jerome Date: Sun Mar 12 01:42:44 2006 New Revision: 385268 URL: http://svn.apache.org/viewcvs?rev=385268view=rev Log: Fix milliseconds dumped by cluster.jsp (Dawid Weiss) Modified: lucene/nutch/trunk/src/web/jsp/cluster.jsp Modified: lucene/nutch/trunk/src/web/jsp/cluster.jsp URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/cluster.jsp?rev=385268r1=385267r2=385268view=diff == --- lucene/nutch/trunk/src/web/jsp/cluster.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/cluster.jsp Sun Mar 12 01:42:44 2006 @@ -21,11 +21,11 @@ // cluster the hits HitsCluster [] clusters = null; if (clusterer != null) { - long clusteringStart = System.currentTimeMillis(); + final long clusteringStart = System.currentTimeMillis(); try { clusters = clusterer.clusterHits( details, summaries ); - long clusteringDuration = start - System.currentTimeMillis(); - bean.LOG.info(Clustering took: + clusteringDuration + milliseconds.); +final long clusteringDuration = System.currentTimeMillis() - clusteringStart; +bean.LOG.info(Clustering took: + clusteringDuration + milliseconds.); } catch (Exception e) { // failed to do clustering (see below) }
svn commit: r382981 - in /lucene/nutch/trunk/src/plugin: ./ analysis-de/ analysis-fr/ clustering-carrot2/ creativecommons/ index-basic/ index-more/ languageidentifier/ lib-http/ lib-parsems/ microform
Author: jerome Date: Fri Mar 3 16:26:54 2006 New Revision: 382981 URL: http://svn.apache.org/viewcvs?rev=382981&view=rev Log: Plugins now assume that the core is already built when building nutch Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml lucene/nutch/trunk/src/plugin/analysis-fr/build.xml lucene/nutch/trunk/src/plugin/build-plugin.xml lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml lucene/nutch/trunk/src/plugin/creativecommons/build.xml lucene/nutch/trunk/src/plugin/index-basic/build.xml lucene/nutch/trunk/src/plugin/index-more/build.xml lucene/nutch/trunk/src/plugin/languageidentifier/build.xml lucene/nutch/trunk/src/plugin/lib-http/build.xml lucene/nutch/trunk/src/plugin/lib-parsems/build.xml lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml lucene/nutch/trunk/src/plugin/ontology/build.xml lucene/nutch/trunk/src/plugin/parse-ext/build.xml lucene/nutch/trunk/src/plugin/parse-html/build.xml lucene/nutch/trunk/src/plugin/parse-js/build.xml lucene/nutch/trunk/src/plugin/parse-mp3/build.xml lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml lucene/nutch/trunk/src/plugin/parse-msword/build.xml lucene/nutch/trunk/src/plugin/parse-pdf/build.xml lucene/nutch/trunk/src/plugin/parse-rss/build.xml lucene/nutch/trunk/src/plugin/parse-swf/build.xml lucene/nutch/trunk/src/plugin/parse-text/build.xml lucene/nutch/trunk/src/plugin/parse-zip/build.xml lucene/nutch/trunk/src/plugin/protocol-file/build.xml lucene/nutch/trunk/src/plugin/protocol-ftp/build.xml lucene/nutch/trunk/src/plugin/protocol-http/build.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml lucene/nutch/trunk/src/plugin/query-basic/build.xml lucene/nutch/trunk/src/plugin/query-more/build.xml lucene/nutch/trunk/src/plugin/query-site/build.xml lucene/nutch/trunk/src/plugin/query-url/build.xml lucene/nutch/trunk/src/plugin/urlfilter-prefix/build.xml
lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/build.xml?rev=382981r1=382980r2=382981view=diff == --- lucene/nutch/trunk/src/plugin/analysis-de/build.xml (original) +++ lucene/nutch/trunk/src/plugin/analysis-de/build.xml Fri Mar 3 16:26:54 2006 @@ -1,12 +1,10 @@ ?xml version=1.0? -project name=analysis-de default=jar +project name=analysis-de default=jar-core import file=../build-plugin.xml/ - - !-- Build compilation dependencies -- + target name=deps-jar -ant target=compile-core inheritall=false dir=${nutch.root}/ ant target=jar inheritall=false dir=../lib-lucene-analyzers/ /target Modified: lucene/nutch/trunk/src/plugin/analysis-fr/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-fr/build.xml?rev=382981r1=382980r2=382981view=diff == --- lucene/nutch/trunk/src/plugin/analysis-fr/build.xml (original) +++ lucene/nutch/trunk/src/plugin/analysis-fr/build.xml Fri Mar 3 16:26:54 2006 @@ -1,12 +1,11 @@ ?xml version=1.0? 
-project name=analysis-fr default=jar +project name=analysis-fr default=jar-core import file=../build-plugin.xml/ !-- Build compilation dependencies -- target name=deps-jar -ant target=compile-core inheritall=false dir=${nutch.root}/ ant target=jar inheritall=false dir=../lib-lucene-analyzers/ /target Modified: lucene/nutch/trunk/src/plugin/build-plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build-plugin.xml?rev=382981r1=382980r2=382981view=diff == --- lucene/nutch/trunk/src/plugin/build-plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/build-plugin.xml Fri Mar 3 16:26:54 2006 @@ -96,9 +96,14 @@ source=${javac.version} deprecation=${javac.deprecation} classpath refid=classpath/ -/javac +/javac /target + target name=compile-core +ant target=compile-core inheritall=false dir=${nutch.root}/ +ant target=compile/ + /target + !-- == -- !-- Make plugin .jar -- !-- == -- @@ -109,6 +114,13 @@ jarfile=${build.dir}/${name}.jar basedir=${build.classes} / + /target + + target name=jar-core depends=compile-core +jar +jarfile=${build.dir}/${name}.jar +basedir=${build.classes} +/ /target
svn commit: r382535 - in /lucene/nutch/trunk: conf/nutch-default.xml src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java src/plugin/protocol-ftp/src/java/org/apache/nutc
Author: jerome Date: Thu Mar 2 14:38:40 2006 New Revision: 382535 URL: http://svn.apache.org/viewcvs?rev=382535view=rev Log: Fix content.limit inconsistency in http, ftp and file Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=382535r1=382534r2=382535view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Mar 2 14:38:40 2006 @@ -13,8 +13,8 @@ namefile.content.limit/name value65536/value descriptionThe length limit for downloaded content, in bytes. - If this value is larger than zero, content longer than it will be - truncated; otherwise (zero or negative), no truncation at all. + If this value is nonnegative (=0), content longer than it will be truncated; + otherwise, no truncation at all. /description /property @@ -150,11 +150,11 @@ nameftp.content.limit/name value65536/value descriptionThe length limit for downloaded content, in bytes. - If this value is larger than zero, content longer than it is truncated; - otherwise (zero or negative), no truncation at all. Caution: classical - ftp RFCs never defines partial transfer and, in fact, some ftp servers - out there do not handle client side forced close-down very well. - Our implementation tries its best to handle such situations smoothly. + If this value is nonnegative (=0), content longer than it will be truncated; + otherwise, no truncation at all. + Caution: classical ftp RFCs never defines partial transfer and, in fact, + some ftp servers out there do not handle client side forced close-down very + well. Our implementation tries its best to handle such situations smoothly. 
/description /property Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=382535r1=382534r2=382535view=diff == --- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Thu Mar 2 14:38:40 2006 @@ -167,7 +167,7 @@ // capture content int len = (int) size; -if (this.file.maxContentLength 0 len this.file.maxContentLength) +if (this.file.maxContentLength = 0 len this.file.maxContentLength) len = this.file.maxContentLength; this.content = new byte[len]; Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java?rev=382535r1=382534r2=382535view=diff == --- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java Thu Mar 2 14:38:40 2006 @@ -344,9 +344,9 @@ } entries.add(ftpFile); count += line.length(); -// impose download limit if limit 0, otherwise no limit +// impose download limit if limit = 0, otherwise no limit // here, cut off is up to the line when total bytes is just over limit -if (limit 0 count limit) { +if (limit = 0 count limit) { mandatory_close = true; break; } @@ -409,9 +409,9 @@ new byte[org.apache.commons.net.io.Util.DEFAULT_COPY_BUFFER_SIZE]; while((len=input.read(buf,0,buf.length)) != -1){ count += len; -// impose download limit if limit 0, otherwise no limit +// impose download limit if limit = 0, otherwise no limit // here, cut off is exactly of limit bytes -if (limit 0 count 
limit) { +if (limit = 0 count limit) { os.write(buf,0,len-(count-limit)); mandatory_close = true; break;
svn commit: r379403 - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/parse/ src/plugin/creativecommons/src/test/org/creativecommons/nutch/ src/plugin/languageidentifier/src/java/org/apache/nu
Author: jerome Date: Tue Feb 21 01:54:21 2006 New Revision: 379403 URL: http://svn.apache.org/viewcvs?rev=379403&view=rev Log: NUTCH-140, parse-plugins.xml can now use extension-id and plugin-id Modified: lucene/nutch/trunk/conf/parse-plugins.dtd lucene/nutch/trunk/conf/parse-plugins.xml lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml Modified: 
lucene/nutch/trunk/conf/parse-plugins.dtd URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=379403r1=379402r2=379403view=diff == --- lucene/nutch/trunk/conf/parse-plugins.dtd (original) +++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Feb 21 01:54:21 2006 @@ -1,7 +1,12 @@ -!ELEMENT parse-plugins (mimeType+) +!ELEMENT parse-plugins (mimeType+,aliases) !ELEMENT mimeType (plugin+) !ATTLIST mimeType name CDATA #REQUIRED !ELEMENT plugin EMPTY !ATTLIST plugin id CDATA #REQUIRED -!ATTLIST plugin order CDATA '' \ No newline at end of file +!ATTLIST plugin order CDATA '' + +!ELEMENT aliases (alias+) +!ELEMENT alias EMPTY +!ATTLIST alias name CDATA #REQUIRED +!ATTLIST alias extension-id CDATA #REQUIRED \ No newline at end of file Modified: lucene/nutch/trunk/conf/parse-plugins.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=379403r1=379402r2=379403view=diff == --- lucene/nutch/trunk/conf/parse-plugins.xml (original) +++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Feb 21 01:54:21 2006 @@ -218,4 +218,33 @@ plugin id=parse-ext / /mimeType + !-- alias mappings for parse-xxx names to the actual extension implementation + ids described in each plugin's plugin.xml file -- + aliases + alias name=parse-ext extension-id=ExtParser / + alias name=parse-html + extension-id=org.apache.nutch.parse.html.HtmlParser / + alias name=parse-js extension-id=JSParser / + alias name=parse-mp3 + extension-id=org.apache.nutch.parse.mp3.MP3Parser / + alias name=parse-msexcel + extension-id=org.apache.nutch.parse.msexcel.MSExcelParser / + alias name=parse-mspowerpoint + extension-id=org.apache.nutch.parse.mspowerpoint.MSPowerPointParser / + alias name=parse-msword + extension-id=org.apache.nutch.parse.msword.MSWordParser / + alias name=parse-pdf + extension-id=org.apache.nutch.parse.pdf.PdfParser / + alias name=parse-rss + extension-id=org.apache.nutch.parse.rss.RSSParser / + alias name=parse-rtf + 
extension-id=org.apache.nutch.parse.rtf.RTFParseFactory / + alias name=parse-swf + extension-id=org.apache.nutch.parse.swf.SWFParser / + alias name=parse-text + extension-id=org.apache.nutch.parse.text.TextParser / + alias name=parse-zip + extension-id=org.apache.nutch.parse.zip.ZipParser / + /aliases + /parse-plugins
svn commit: r379419 - in /lucene/nutch/trunk: site/mailing_lists.html site/mailing_lists.pdf src/site/src/documentation/content/xdocs/mailing_lists.xml
Author: jerome Date: Tue Feb 21 03:10:42 2006 New Revision: 379419 URL: http://svn.apache.org/viewcvs?rev=379419view=rev Log: NUTCH-214, Add a search mailing list archive link (Jake Vanderdray) Modified: lucene/nutch/trunk/site/mailing_lists.html lucene/nutch/trunk/site/mailing_lists.pdf lucene/nutch/trunk/src/site/src/documentation/content/xdocs/mailing_lists.xml Modified: lucene/nutch/trunk/site/mailing_lists.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/mailing_lists.html?rev=379419r1=379418r2=379419view=diff == --- lucene/nutch/trunk/site/mailing_lists.html (original) +++ lucene/nutch/trunk/site/mailing_lists.html Tue Feb 21 03:10:42 2006 @@ -3,7 +3,7 @@ head META http-equiv=Content-Type content=text/html; charset=UTF-8 meta content=Apache Forrest name=Generator -meta name=Forrest-version content=0.6 +meta name=Forrest-version content=0.7 meta name=Forrest-skin-name content=pelt titleNutch Mailing Lists/title link type=text/css href=skin/basic.css rel=stylesheet @@ -16,49 +16,22 @@ body onload=init() script type=text/javascriptndeSetTextSize();/script div id=top -!--+ -|breadtrail -+-- div class=breadtrail a href=http://www.apache.org/;Apache/a gt; a href=http://lucene.apache.org/;Lucene/a gt; a href=http://lucene.apache.org/nutch/;Nutch/ascript src=skin/breadcrumbs.js language=JavaScript type=text/javascript/script /div -!--+ -|header -+-- div class=header -!--+ -|start group logo -+-- div class=grouplogo a href=http://lucene.apache.org/;img class=logoImage alt=Lucene src=http://lucene.apache.org/java/docs/images/lucene_green_150.gif; title=Apache Lucene/a /div -!--+ -|end group logo -+-- -!--+ -|start Project Logo -+-- div class=projectlogo a href=http://lucene.apache.org/nutch/;img class=logoImage alt=Nutch src=images/nutch-logo.gif title=Open Source Web Search Software/a /div -!--+ -|end Project Logo -+-- -!--+ -|start Search -+-- div class=searchbox form action=http://www.google.com/search; method=get class=roundtopsmall -input 
value=lucene.apache.org name=sitesearch type=hiddeninput onFocus=getBlank (this, 'Search the site with google:'); value=Search the site with google: size=25 name=q id=query type=textnbsp; -input name=Search value=Search type=submit +input value=lucene.apache.org name=sitesearch type=hiddeninput onFocus=getBlank (this, 'Search the site with google'); size=25 name=q id=query type=text value=Search the site with googlenbsp; +input attr=value name=Search value=Search type=submit /form /div -!--+ -|end search -+-- -!--+ -|start Tabs -+-- ul id=tabs li class=current a class=base-selected href=index.htmlMain/a @@ -67,110 +40,83 @@ a class=base-not-selected href=http://wiki.apache.org/nutch/;Wiki/a /li /ul -!--+ -|end Tabs -+-- /div /div div id=main div id=publishedStrip -!--+ -|start Subtabs -+-- div id=level2tabs/div -!--+ -|end Endtabs -+-- -script type=text/javascript language=JavaScript!-- - document.write(Published: + document.lastModified); - // --/script -/div -!--+ -|breadtrail -+-- +script type=text/javascript!-- +document.write(textLast Published:/text + document.lastModified); +// --/script +/div div class=breadtrail nbsp; /div -!--+ -|start Menu, mainarea -+-- -!--+ -|start Menu -+-- div id=menu div onclick=SwitchMenu('menu_1.1', 'skin/') id=menu_1.1Title class=menutitleProject/div div id=menu_1.1 class=menuitemgroup div class=menuitem -a title= href=index.htmlNews/a +a href=index.htmlNews/a /div div class=menuitem -a title= href=about.htmlAbout/a +a href=about.htmlAbout/a /div div class=menuitem -a title= href=credits.htmlCredits/a +a href=credits.htmlCredits/a /div div class=menuitem -a title= href=http://www.cafepress.com/nutch/;Buy Stuff/a +a href=http://www.cafepress.com/nutch/;Buy Stuff/a /div /div div onclick=SwitchMenu('menu_1.2', 'skin/') id=menu_1.2Title class=menutitleDocumentation/div div id=menu_1.2 class=menuitemgroup div class=menuitem -a title= href=http://wiki.apache.org/nutch/FAQ;FAQ/a +a href=http://wiki.apache.org/nutch/FAQ;FAQ/a /div div 
class=menuitem -a title= href=http://wiki.apache.org/nutch/;Wiki/a +a href=http://wiki.apache.org/nutch/;Wiki/a /div div class=menuitem -a title= href=tutorial.htmlTutorial/a +a href=tutorial.htmlTutorial/a /div div class=menuitem -a title= href=bot.htmlRobot /a +a href=bot.htmlRobot /a /div div class=menuitem -a title= href=i18n.htmli18n/a +a href=i18n.htmli18n/a /div div class=menuitem -a title= href=apidocs/index.htmlAPI Docs/a +a href=apidocs/index.htmlAPI Docs/a /div /div div onclick=SwitchMenu('menu_selected_1.3', 'skin/') id=menu_selected_1.3Title class=menutitle style=background-image
svn commit: r379511 - in /lucene/nutch/trunk: docs/fr/help.html src/web/pages/fr/help.xml
Author: jerome Date: Tue Feb 21 08:05:07 2006 New Revision: 379511 URL: http://svn.apache.org/viewcvs?rev=379511view=rev Log: Add fr help page Added: lucene/nutch/trunk/docs/fr/help.html (with props) lucene/nutch/trunk/src/web/pages/fr/help.xml (with props) Added: lucene/nutch/trunk/docs/fr/help.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/fr/help.html?rev=379511view=auto == --- lucene/nutch/trunk/docs/fr/help.html (added) +++ lucene/nutch/trunk/docs/fr/help.html Tue Feb 21 08:05:07 2006 @@ -0,0 +1,161 @@ +!DOCTYPE HTML PUBLIC -//W3C//DTD HTML 4.01 Transitional//EN +html!--This page is automatically generated. Do not edit!-- +head +META http-equiv=Content-Type content=text/html; charset=UTF-8 +titleNutch: aide/title +style type=text/css +.menuTd {background-color: #F9F7F4; height: 25; onMouseOver: this.style.backgroundColor='#ECE5DC';} +.menuTdhover {background-color: #ECE5DC; height: 25; onMouseOver: this.style.backgroundColor='#ECE5DC';} +.menuEntry { font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: #00; text-decoration: none} +.body {background-color: #F9F7F4;} +.bodytext { font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: #00; text-decoration: none} +.title { font-family: Arial, Helvetica, sans-serif; font-size: 26px; color: #FF9900; text-decoration: none} +.intro { font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: #FF9900; text-decoration: none} +.orangeTd {background-color: #FF9900} +ul {list-style-image: url(../img/reiter/ul.gif)} +h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: #00;} +h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: #00;} +.url {color: #996600;} +.highlight {font-weight: bold;} +.ellipsis {font-weight: bold;} +/style +link type=image/x-icon href=../img/favicon.ico rel=icon +link type=image/x-icon href=../img/favicon.ico rel=shortcut icon +script type=text/javascript +!-- +function queryfocus() { document.search.query.focus(); 
} +// --/script +/head +body onLoad=queryfocus(); +!--This file is automatically generated. Do not edit!-- +table cellspacing=0 cellpadding=0 border=0 width=635 +tr +td rowspan=2 width=140 valign=bottoma href=./img border=0 src=../img/reiter/logo_nutch.gif/aimg height=1 width=140 src=../img/reiter/spacer_66.gif/td +/tr +tr +td align=right valign=bottom width=495 +table width=495 cellspacing=0 cellpadding=0 border=0 +tr +td width=400 background=../img/reiter/_bg_reiter.gifnbsp;/tdtd width=10 valign=bottom height=28img border=0 src=../img/reiter/reiter_inactive_le1.gif/tdtd nowrap=nowrap valign=bottom background=../img/reiter/_bg_reiter_inactive.gifa href=about.html class=bodytextA propos/a/tdtd width=10 valign=bottom height=28img border=0 src=../img/reiter/reiter_inactive_ri.gif/tdtd width=10 valign=bottom height=28img border=0 src=../img/reiter/reiter_inactive_le.gif/tdtd nowrap=nowrap valign=bottom background=../img/reiter/_bg_reiter_inactive.gifa href=http://wiki.apache.org/nutch/FAQ; class=bodytextQuestions freacute;quentes/a/tdtd width=10 valign=bottom height=28img border=0 src=../img/reiter/reiter_inactive_ri.gif/td +/tr +/table +/td +/tr +/table +table cellspacing=0 cellpadding=0 border=0 width=635 +tr valign=top +td width=140 +table cellspacing=0 cellpadding=0 width=100% +tr +td#160;/td +/tr +/table +/tdtd background=../img/reiter/_spacer_cc.gif width=20#160;/tdtd class=body width=475 +table cellspacing=0 cellpadding=0 border=0 width=475 +tr +td valign=bottom width=275 height=125 class=titleaide/tdtd valign=bottom width=200 height=125img src=../img/reiter/robots.gif/td +/tr +/table +br class=br +span class=bodytext + +/spanspan class=bodytext +h3Requecirc;tes/h3 +/spanspan class=bodytext +Pour effectuer une recherche avec Nutch, il suffit de saisir quelques mots. 
+/spanspan class=bodytext +ul + +liLes reacute;sultats contiendront uniquement les pages qui contiennent +span style=font-style: italic;tous/span les mots de la question./li + +liUtilisez les doubles chevrons autour des termes qui doivent ecirc;tre adjacents, +comme dans le cas d'une phrase. Par exemple span style=font-weight: bold; +Nouvelle Zeacute;lande/span./li + +liLa ponctuation entre les mots est ignoreacute;e. Ainsi, la recherche +span style=font-weight: bold;http://www.nutch.org//span est eacute;quivalente agrave; +span style=font-weight: bold;http www nutch org/span./li + +liLa recherche n'est pas sensible agrave; la casse. Ainsi, la recherche +span style=font-weight: bold;NuTcH/span est eacute;quivalente agrave; span style=font-weight: bold;nUtCh/span./li + +liVous pouvez exclure un mot des reacute;sultats en placcedil;ant un signe moins +devant. Ainsi, la recherche span style=font-weight: bold;football +-nfl/span retournera les pages qui parlent de football, mais qui ne contiennent +pas le mot nfl./li + +/ul
svn commit: r378653 - in /lucene/nutch/trunk/src/plugin/parse-rtf/src: java/org/apache/nutch/parse/rtf/RTFParseFactory.java java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java test/org/apache/n
Author: jerome Date: Fri Feb 17 15:22:55 2006 New Revision: 378653 URL: http://svn.apache.org/viewcvs?rev=378653view=rev Log: Adapt parse-rtf to nutch APIs changes (metadata, parse, protocol, ...) Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=378653r1=378652r2=378653view=diff == --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Fri Feb 17 15:22:55 2006 @@ -13,38 +13,42 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.nutch.parse.rtf; -import org.apache.nutch.parse.*; -import org.apache.nutch.protocol.Content; - .mine - .mine -import org.apache.nutch.util.MetadataNames; - -=== -import org.apache.nutch.util.NutchConf; -=== -import org.apache.hadoop.conf.Configuration; - .r374853 - .r373941 +// JDK imports import java.io.ByteArrayInputStream; import java.io.InputStreamReader; import java.io.Reader; -import java.util.Properties; +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Nutch imports +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.protocol.Content; + +// RTF Parser imports +import com.etranslate.tm.processing.rtf.ParseException; import com.etranslate.tm.processing.rtf.RTFParser; + /** * A parser for RTF documents * * @author Andy Hedges */ -public class RTFParseFactory implements Parser, MetadataNames { +public class RTFParseFactory implements Parser { private Configuration conf; - public Parse getParse(Content content) throws ParseException { + public Parse getParse(Content content) { byte[] raw = content.getContent(); Reader reader = new InputStreamReader(new ByteArrayInputStream(raw)); RTFParserDelegateImpl delegate = new RTFParserDelegateImpl(); @@ -55,28 +59,31 @@ try { rtfParser.parse(); -} catch (com.etranslate.tm.processing.rtf.ParseException e) { - throw new ParseException(Exception parsing RTF document, e); +} catch (ParseException e) { +return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_EXCEPTION, + e.toString()).getEmptyParse(conf); } -Properties metadata = new Properties(); -metadata.putAll(content.getMetadata()); -metadata.putAll(delegate.getMetaData()); -String title = 
metadata.getProperty(TITLE); +Metadata metadata = new Metadata(); +metadata.setAll(delegate.getMetaData()); +String title = metadata.get(DublinCore.TITLE); if (title != null) { -//(CM): Why remove the title metadata property here? Even -//though it's stored in the ParseData, it still might be useful -//to have via this properties object? -//metadata.remove(title); + metadata.remove(DublinCore.TITLE); } else { title = ; } String text = delegate.getText(); -return new ParseImpl(text, new ParseData(title, OutlinkExtractor -.getOutlinks(text, this.conf), metadata)); +return new ParseImpl(text, + new ParseData(ParseStatus.STATUS_SUCCESS, + title, + OutlinkExtractor +. getOutlinks(text, this.conf), + content.getMetadata(), + metadata)); } public void setConf(Configuration conf) { Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java?rev=378653r1=378652r2=378653view=diff == --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java (original
svn commit: r378655 - in /lucene/nutch/trunk/src/plugin: ./ analysis-de/ analysis-fr/ clustering-carrot2/ creativecommons/ index-basic/ index-more/ languageidentifier/ lib-commons-httpclient/ lib-http
Author: jerome Date: Fri Feb 17 15:28:39 2006 New Revision: 378655 URL: http://svn.apache.org/viewcvs?rev=378655view=rev Log: Review plugins building and testing Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml lucene/nutch/trunk/src/plugin/analysis-fr/build.xml lucene/nutch/trunk/src/plugin/build-plugin.xml lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml lucene/nutch/trunk/src/plugin/creativecommons/build.xml lucene/nutch/trunk/src/plugin/index-basic/build.xml lucene/nutch/trunk/src/plugin/index-more/build.xml lucene/nutch/trunk/src/plugin/languageidentifier/build.xml lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml lucene/nutch/trunk/src/plugin/lib-http/build.xml lucene/nutch/trunk/src/plugin/lib-jakarta-poi/build.xml lucene/nutch/trunk/src/plugin/lib-log4j/build.xml lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml lucene/nutch/trunk/src/plugin/lib-nekohtml/build.xml lucene/nutch/trunk/src/plugin/lib-parsems/build.xml lucene/nutch/trunk/src/plugin/ontology/build.xml lucene/nutch/trunk/src/plugin/parse-ext/build.xml lucene/nutch/trunk/src/plugin/parse-html/build.xml lucene/nutch/trunk/src/plugin/parse-js/build.xml lucene/nutch/trunk/src/plugin/parse-mp3/build.xml lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml lucene/nutch/trunk/src/plugin/parse-msword/build.xml lucene/nutch/trunk/src/plugin/parse-pdf/build.xml lucene/nutch/trunk/src/plugin/parse-rss/build.xml lucene/nutch/trunk/src/plugin/parse-rtf/build.xml lucene/nutch/trunk/src/plugin/parse-swf/build.xml lucene/nutch/trunk/src/plugin/parse-text/build.xml lucene/nutch/trunk/src/plugin/parse-zip/build.xml lucene/nutch/trunk/src/plugin/protocol-file/build.xml lucene/nutch/trunk/src/plugin/protocol-ftp/build.xml lucene/nutch/trunk/src/plugin/protocol-http/build.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml 
lucene/nutch/trunk/src/plugin/query-basic/build.xml lucene/nutch/trunk/src/plugin/query-more/build.xml lucene/nutch/trunk/src/plugin/query-site/build.xml lucene/nutch/trunk/src/plugin/query-url/build.xml lucene/nutch/trunk/src/plugin/urlfilter-prefix/build.xml lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/build.xml?rev=378655r1=378654r2=378655view=diff == --- lucene/nutch/trunk/src/plugin/analysis-de/build.xml (original) +++ lucene/nutch/trunk/src/plugin/analysis-de/build.xml Fri Feb 17 15:28:39 2006 @@ -4,9 +4,16 @@ import file=../build-plugin.xml/ + !-- Build compilation dependencies -- + target name=deps-jar +ant target=compile-core inheritall=false dir=${nutch.root}/ +ant target=jar inheritall=false dir=../lib-lucene-analyzers/ + /target + + !-- Add compilation dependencies to classpath -- path id=plugin.deps -fileset dir=../lib-lucene-analyzers/lib - include name=*.jar / +fileset dir=${nutch.root}/build + include name=**/lib-lucene-analyzers/*.jar / /fileset /path Modified: lucene/nutch/trunk/src/plugin/analysis-fr/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-fr/build.xml?rev=378655r1=378654r2=378655view=diff == --- lucene/nutch/trunk/src/plugin/analysis-fr/build.xml (original) +++ lucene/nutch/trunk/src/plugin/analysis-fr/build.xml Fri Feb 17 15:28:39 2006 @@ -4,9 +4,16 @@ import file=../build-plugin.xml/ + !-- Build compilation dependencies -- + target name=deps-jar +ant target=compile-core inheritall=false dir=${nutch.root}/ +ant target=jar inheritall=false dir=../lib-lucene-analyzers/ + /target + + !-- Add compilation dependencies to classpath -- path id=plugin.deps -fileset dir=../lib-lucene-analyzers/lib - include name=*.jar / +fileset dir=${nutch.root}/build + include name=**/lib-lucene-analyzers/*.jar / /fileset /path Modified: 
lucene/nutch/trunk/src/plugin/build-plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build-plugin.xml?rev=378655r1=378654r2=378655view=diff == --- lucene/nutch/trunk/src/plugin/build-plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/build-plugin.xml Fri Feb 17 15:28:39 2006 @@ -68,10 +68,22 @@ !-- to be overridden by sub-projects -- target name=init-plugin/ + !-- + ! Used to build plugin compilation dependencies + ! (to be overridden by plugins) + !-- + target name=deps-jar/ + + !-- + ! Used to deploy plugin runtime
svn commit: r378667 - in /lucene/nutch/trunk/src/plugin/parse-mp3/src: java/org/apache/nutch/parse/mp3/MP3Parser.java java/org/apache/nutch/parse/mp3/MetadataCollector.java test/org/apache/nutch/parse
Author: jerome Date: Fri Feb 17 16:23:35 2006 New Revision: 378667 URL: http://svn.apache.org/viewcvs?rev=378667view=rev Log: Adapts parse-mp3 to nutch APIs changes (metadata, parse, protocol, ...) Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java?rev=378667r1=378666r2=378667view=diff == --- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java Fri Feb 17 16:23:35 2006 @@ -16,10 +16,14 @@ package org.apache.nutch.parse.mp3; +// JDK imports +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.Iterator; -import org.apache.nutch.parse.*; -import org.apache.nutch.protocol.Content; -import org.apache.hadoop.conf.Configuration; +// Java ID3 Tag imports import org.farng.mp3.MP3File; import org.farng.mp3.TagException; import org.farng.mp3.id3.AbstractID3v2; @@ -27,29 +31,35 @@ import org.farng.mp3.id3.ID3v1; import org.farng.mp3.object.AbstractMP3Object; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.net.MalformedURLException; -import java.util.Iterator; +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Nutch imports +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseException; +import 
org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.protocol.Content; + /** * A parser for MP3 audio files * @author Andy Hedges */ - public class MP3Parser implements Parser { private MetadataCollector metadataCollector; private Configuration conf; - public Parse getParse(Content content) throws ParseException { -Parse parse = null; -metadataCollector.putAll(content.getMetadata()); + public Parse getParse(Content content) { +Parse parse = null; byte[] raw = content.getContent(); - File tmp = null; + try { tmp = File.createTempFile(nutch, .mp3); FileOutputStream fos = new FileOutputStream(tmp); @@ -58,25 +68,31 @@ MP3File mp3 = new MP3File(tmp); if (mp3.hasID3v2Tag()) { -parse = getID3v2Parse(mp3); +parse = getID3v2Parse(mp3, content.getMetadata()); } else if (mp3.hasID3v1Tag()) { -parse = getID3v1Parse(mp3); +parse = getID3v1Parse(mp3, content.getMetadata()); } else { -throw new ParseException(No textual content available); +return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_MISSING_CONTENT, + No textual content available).getEmptyParse(conf); } - - } catch (IOException e) { - throw new ParseException(Couldn't create temporary file, e); + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_EXCEPTION, + Couldn't create temporary file: + e).getEmptyParse(conf); } catch (TagException e) { - throw new ParseException(ID3 Tags could not be parsed, e); + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_EXCEPTION, + ID3 Tags could not be parsed: + e).getEmptyParse(conf); } finally{ tmp.delete(); } return parse; } - private Parse getID3v1Parse(MP3File mp3) throws MalformedURLException { + private Parse getID3v1Parse(MP3File mp3, Metadata contentMeta) + throws MalformedURLException { + ID3v1 tag = mp3.getID3v1Tag(); metadataCollector.notifyProperty(TALB-Text, tag.getAlbum()); metadataCollector.notifyProperty(TPE1-Text, tag.getArtist()); @@ -84,13 
+100,17 @@ metadataCollector.notifyProperty(TCON-Text, ( + tag.getGenre() + )); metadataCollector.notifyProperty(TIT2-Text, tag.getTitle()); metadataCollector.notifyProperty(TYER-Text, tag.getYear()); -ParseData parseData = new ParseData(metadataCollector.getTitle(), -metadataCollector.getOutlinks(), -metadataCollector.getData(), getConf()); +ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS
svn commit: r378214 - in /lucene/nutch/trunk/src/plugin: ./ clustering-carrot2/ lib-commons-httpclient/ lib-commons-httpclient/lib/ parse-rss/lib/ protocol-httpclient/ protocol-httpclient/lib/
Author: jerome Date: Thu Feb 16 02:10:23 2006 New Revision: 378214 URL: http://svn.apache.org/viewcvs?rev=378214view=rev Log: NUTCH-196 : add a httpclient library (lib-commons-httpclient) Added: lucene/nutch/trunk/src/plugin/lib-commons-httpclient/ lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml (with props) lucene/nutch/trunk/src/plugin/lib-commons-httpclient/lib/ lucene/nutch/trunk/src/plugin/lib-commons-httpclient/lib/commons-httpclient-3.0.jar (with props) lucene/nutch/trunk/src/plugin/lib-commons-httpclient/plugin.xml (with props) Removed: lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-httpclient-3.0-beta1.jar lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar Modified: lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=378214r1=378213r2=378214view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Thu Feb 16 02:10:23 2006 @@ -11,6 +11,7 @@ ant dir=index-basic target=deploy/ ant dir=index-more target=deploy/ ant dir=languageidentifier target=deploy/ + ant dir=lib-commons-httpclient target=deploy/ ant dir=lib-http target=deploy/ ant dir=lib-jakarta-poi target=deploy/ ant dir=lib-log4j target=deploy/ @@ -77,6 +78,7 @@ ant dir=index-basic target=clean/ ant dir=index-more target=clean/ ant dir=languageidentifier target=clean/ +ant dir=lib-commons-httpclient target=clean/ ant dir=lib-http target=clean/ ant dir=lib-jakarta-poi target=clean/ ant dir=lib-log4j target=clean/ Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml?rev=378214r1=378213r2=378214view=diff == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml Thu Feb 16 02:10:23 2006 @@ -5,8 +5,9 @@ import file=../build-plugin.xml/ path id=plugin.deps -fileset dir=../lib-log4j/lib - include name=*.jar / +fileset dir=${nutch.root}/build + include name=**/lib-log4j/*.jar / + include name=**/lib-commons-httpclient/*.jar / /fileset /path Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=378214r1=378213r2=378214view=diff == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Thu Feb 16 02:10:23 2006 @@ -30,6 +30,7 @@ requires import plugin=nutch-extensionpoints/ import plugin=lib-log4j/ + import plugin=lib-commons-httpclient/ /requires extension id=org.apache.nutch.clustering.carrot2 Added: lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml?rev=378214view=auto == --- lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml (added) +++ lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml Thu Feb 16 02:10:23 2006 @@ -0,0 +1,21 @@ +?xml version=1.0? + +project name=lib-commons-httpclient default=jar + + import file=../build-plugin.xml/ + + !-- + ! Override the compile and jar targets, + ! since there is nothing to compile here. + ! 
-- + target name=compile depends=init +echo message=Compiling plugin: ${name}/ + /target + + target name=jar depends=compile +copy todir=${build.dir} verbose=true + fileset dir=./lib includes=**/*.jar/ +/copy + /target + +/project Propchange: lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml -- svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/lib-commons-httpclient/lib/commons-httpclient-3.0.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-commons-httpclient/lib/commons-httpclient-3.0.jar?rev=378214view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk
svn commit: r378215 - /lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
Author: jerome Date: Thu Feb 16 02:22:10 2006 New Revision: 378215 URL: http://svn.apache.org/viewcvs?rev=378215view=rev Log: remove the unused httpclient library declaration Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml?rev=378215r1=378214r2=378215view=diff == --- lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Thu Feb 16 02:22:10 2006 @@ -10,7 +10,6 @@ export name=*/ /library library name=commons-codec.jar / - library name=commons-httpclient-3.0.jar / /runtime requires
svn commit: r378222 - /lucene/nutch/trunk/src/plugin/build.xml
Author: jerome Date: Thu Feb 16 02:51:14 2006 New Revision: 378222 URL: http://svn.apache.org/viewcvs?rev=378222view=rev Log: add lib-nekohtml to deploy and clean and force the lib plugins to be built first Modified: lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=378222r1=378221r2=378222view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Thu Feb 16 02:51:14 2006 @@ -6,17 +6,18 @@ !-- Build deploy all the plugin jars.-- !-- == -- target name=deploy - ant dir=clustering-carrot2 target=deploy/ - ant dir=creativecommons target=deploy/ - ant dir=index-basic target=deploy/ - ant dir=index-more target=deploy/ - ant dir=languageidentifier target=deploy/ ant dir=lib-commons-httpclient target=deploy/ ant dir=lib-http target=deploy/ ant dir=lib-jakarta-poi target=deploy/ ant dir=lib-log4j target=deploy/ ant dir=lib-lucene-analyzers target=deploy/ + ant dir=lib-nekohtml target=deploy/ ant dir=lib-parsems target=deploy/ + ant dir=clustering-carrot2 target=deploy/ + ant dir=creativecommons target=deploy/ + ant dir=index-basic target=deploy/ + ant dir=index-more target=deploy/ + ant dir=languageidentifier target=deploy/ ant dir=nutch-extensionpoints target=deploy/ ant dir=ontology target=deploy/ ant dir=protocol-file target=deploy/ @@ -83,6 +84,7 @@ ant dir=lib-jakarta-poi target=clean/ ant dir=lib-log4j target=clean/ ant dir=lib-lucene-analyzers target=clean/ +ant dir=lib-nekohtml target=clean/ ant dir=lib-parsems target=clean/ ant dir=nutch-extensionpoints target=clean/ ant dir=ontology target=clean/
svn commit: r378011 - in /lucene/nutch/trunk/src/plugin: ./ clustering-carrot2/ clustering-carrot2/lib/ lib-log4j/ lib-log4j/lib/ parse-pdf/ parse-pdf/lib/ parse-rss/ parse-rss/lib/
Author: jerome Date: Wed Feb 15 06:24:56 2006 New Revision: 378011 URL: http://svn.apache.org/viewcvs?rev=378011view=rev Log: Add a log4j library plugin (lib-log4j) Added: lucene/nutch/trunk/src/plugin/lib-log4j/ lucene/nutch/trunk/src/plugin/lib-log4j/build.xml (with props) lucene/nutch/trunk/src/plugin/lib-log4j/lib/ lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar (with props) lucene/nutch/trunk/src/plugin/lib-log4j/plugin.xml (with props) Removed: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/log4j-1.2.11.jar lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/log4j.LICENSE lucene/nutch/trunk/src/plugin/parse-pdf/lib/log4j-1.2.9.jar lucene/nutch/trunk/src/plugin/parse-pdf/lib/log4j-LICENSE.txt lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar Modified: lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml lucene/nutch/trunk/src/plugin/parse-pdf/build.xml lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml lucene/nutch/trunk/src/plugin/parse-rss/build.xml lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=378011r1=378010r2=378011view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Wed Feb 15 06:24:56 2006 @@ -13,6 +13,7 @@ ant dir=languageidentifier target=deploy/ ant dir=lib-http target=deploy/ ant dir=lib-jakarta-poi target=deploy/ + ant dir=lib-log4j target=deploy/ ant dir=lib-lucene-analyzers target=deploy/ ant dir=lib-parsems target=deploy/ ant dir=nutch-extensionpoints target=deploy/ @@ -78,6 +79,7 @@ ant dir=languageidentifier target=clean/ ant dir=lib-http target=clean/ ant dir=lib-jakarta-poi target=clean/ +ant dir=lib-log4j target=clean/ ant dir=lib-lucene-analyzers target=clean/ ant dir=lib-parsems target=clean/ ant 
dir=nutch-extensionpoints target=clean/ Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml?rev=378011r1=378010r2=378011view=diff == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml Wed Feb 15 06:24:56 2006 @@ -4,4 +4,10 @@ import file=../build-plugin.xml/ + path id=plugin.deps +fileset dir=../lib-log4j/lib + include name=*.jar / +/fileset + /path + /project Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=378011r1=378010r2=378011view=diff == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Wed Feb 15 06:24:56 2006 @@ -29,6 +29,7 @@ requires import plugin=nutch-extensionpoints/ + import plugin=lib-log4j/ /requires extension id=org.apache.nutch.clustering.carrot2 Added: lucene/nutch/trunk/src/plugin/lib-log4j/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-log4j/build.xml?rev=378011view=auto == --- lucene/nutch/trunk/src/plugin/lib-log4j/build.xml (added) +++ lucene/nutch/trunk/src/plugin/lib-log4j/build.xml Wed Feb 15 06:24:56 2006 @@ -0,0 +1,17 @@ +?xml version=1.0? + +project name=lib-log4j default=jar + + import file=../build-plugin.xml/ + + !-- + ! Override the compile and jar targets, + ! since there is nothing to compile here. + ! 
-- + target name=compile depends=init +echo message=Compiling plugin: ${name}/ + /target + + target name=jar depends=compile/ + +/project Propchange: lucene/nutch/trunk/src/plugin/lib-log4j/build.xml -- svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar?rev=378011view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/lib-log4j/plugin.xml URL: http://svn.apache.org
svn commit: r377494 - in /lucene/nutch/trunk/src/plugin: parse-msexcel/ parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ parse-mspowerpoint/ parse-mspowerpoint/src/java/org/apache/nutch/parse/ms
Author: jerome Date: Mon Feb 13 13:28:13 2006 New Revision: 377494 URL: http://svn.apache.org/viewcvs?rev=377494view=rev Log: Make use of lib-parsems in word, powerpoint and excel parsers Removed: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html lucene/nutch/trunk/src/plugin/parse-msword/build.xml lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml?rev=377494r1=377493r2=377494view=diff == --- lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml Mon Feb 13 13:28:13 2006 @@ -2,19 +2,23 @@ project name=parse-msexcel default=jar - import file=../build-plugin.xml / + import file=../build-plugin.xml / path id=plugin.deps fileset dir=../lib-jakarta-poi/lib include name=*.jar / /fileset +fileset dir=../../../build/lib-parsems + include name=*.jar / +/fileset /path - !-- for junit test -- - mkdir dir=${build.test}/data / - copy todir=${build.test}/data - fileset dir=sample - include name=*.xls / - /fileset - /copy + !-- for junit test -- + mkdir dir=${build.test}/data / + copy todir=${build.test}/data +fileset dir=sample + include name=*.xls / +/fileset + /copy + /project Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml?rev=377494r1=377493r2=377494view=diff == --- lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml Mon Feb 13 13:28:13 2006 @@ -14,6 +14,7 @@ requires import plugin=nutch-extensionpoints/ import plugin=lib-jakarta-poi/ + import plugin=lib-parsems/ /requires extension id=org.apache.nutch.parse.msexcel Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?rev=377494r1=377493r2=377494view=diff == --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java (original) +++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java Mon Feb 13 13:28:13 2006 @@ -16,17 +16,17 @@ package org.apache.nutch.parse.msexcel; // JDK imports -import java.io.IOException; import java.io.InputStream; -import java.util.Date; -import java.util.Properties; // Jakarta POI imports import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import
svn commit: r377501 - in /lucene/nutch/trunk: ./ src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/ src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/
Author: jerome Date: Mon Feb 13 13:43:15 2006 New Revision: 377501 URL: http://svn.apache.org/viewcvs?rev=377501view=rev Log: Javadoc updates for ms parsers Added: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html (with props) Modified: lucene/nutch/trunk/build.xml lucene/nutch/trunk/default.properties lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=377501r1=377500r2=377501view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Mon Feb 13 13:43:15 2006 @@ -249,6 +249,7 @@ packageset dir=${src.dir}/ packageset dir=${plugins.dir}/lib-http/src/java/ + packageset dir=${plugins.dir}/lib-parsems/src/java/ packageset dir=${plugins.dir}/ontology/src/java/ packageset dir=${plugins.dir}/protocol-file/src/java/ packageset dir=${plugins.dir}/protocol-ftp/src/java/ Modified: lucene/nutch/trunk/default.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=377501r1=377500r2=377501view=diff == --- lucene/nutch/trunk/default.properties (original) +++ lucene/nutch/trunk/default.properties Mon Feb 13 13:43:15 2006 @@ -68,6 +68,7 @@ plugin.msword=org.apache.nutch.parse.msword* # Unfortunately, ontology on core and plugin uses the same package: # plugin.ontology=org.apache.nutch.ontology* +plugin.parsems=org.apache.nutch.parse.ms* plugin.pdf=org.apache.nutch.parse.pdf* plugin.rss=org.apache.nutch.parse.rss* plugin.rtf=org.apache.nutch.parse.rtf* @@ -95,6 +96,7 @@ ${plugin.msexcel}:\ ${plugin.mspowerpoint}:\ ${plugin.msword}:\ + ${plugin.parsems}:\ ${plugin.pdf}:\ ${plugin.rss}:\ ${plugin.rtf}:\ Modified: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?rev=377501r1=377500r2=377501view=diff == --- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Mon Feb 13 13:43:15 2006 @@ -56,7 +56,7 @@ /** * Parses a Content with a specific [EMAIL PROTECTED] MSExtractor Microsoft document - * extractor. + * extractor}. */ protected Parse getParse(MSExtractor extractor, Content content) { Added: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html?rev=377501view=auto == --- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html (added) +++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html Mon Feb 13 13:43:15 2006 @@ -0,0 +1,5 @@ +html +body +pCommon API for Microsoft copy; documents parsing./p +/body +/html Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html -- svn:eol-style = native Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java?rev=377501r1=377500r2=377501view=diff == --- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java Mon Feb 13 13:43:15 2006 @@ -23,7 +23,6 @@ * * @author Stephan Strittmatter - http://www.sybit.de * @version 1.0 - * 
@create 19.01.2005 */ public class FilteredStringWriter extends StringWriter { @@ -67,4 +66,4 @@ super.write(ch); } } -} \ No newline at end of file +}
svn commit: r376966 - /lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
Author: jerome Date: Sat Feb 11 02:56:14 2006 New Revision: 376966 URL: http://svn.apache.org/viewcvs?rev=376966view=rev Log: Fix parse-msexcel unit tests Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java?rev=376966r1=376965r2=376966view=diff == --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java Sat Feb 11 02:56:14 2006 @@ -4,18 +4,25 @@ */ package org.apache.nutch.parse.msexcel; +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.protocol.ProtocolFactory; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.ProtocolException; - -import org.apache.nutch.parse.ParserFactory; -import org.apache.nutch.parse.Parser; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseException; +import org.apache.nutch.util.NutchConfiguration; +// JUnit imports import junit.framework.TestCase; +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.UTF8; + + /** * Based on Unit tests for MSWordParser by John Xing * @@ -31,31 +38,32 @@ private String[] sampleFiles = {test.xls}; - private String expectedText = BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078.0 CS 599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!! 
; + private String expectedText = BitStream test.xls 321654.0 Apache + +incubator 1234.0 Doug Cutting 89078.0 + +CS 599 Search Engines Spring 2005.0 SBC + +1234.0 764893.0 Java NUTCH!! ; public TestMSExcelParser(String name) { -super(name); +super(name); } - protected void setUp() {} - - protected void tearDown() {} - public void testIt() throws ProtocolException, ParseException { + String urlString; Protocol protocol; Content content; -Parser parser; Parse parse; +Configuration conf = NutchConfiguration.create(); +ParseUtil parser = new ParseUtil(conf); +ProtocolFactory factory = new ProtocolFactory(conf); for (int i = 0; i sampleFiles.length; i++) { urlString = file: + sampleDir + fileSeparator + sampleFiles[i]; - protocol = ProtocolFactory.getProtocol(urlString); - content = protocol.getContent(urlString); - - parser = ParserFactory.getParser(content.getContentType(), urlString); - parse = parser.getParse(content); + protocol = factory.getProtocol(urlString); + content = protocol.getProtocolOutput(new UTF8(urlString), + new CrawlDatum()).getContent(); + parse = parser.parseByParserId(parse-msexcel, content); assertTrue(parse.getText().equals(expectedText)); }
svn commit: r376638 - in /lucene/nutch/trunk/src/plugin/parse-msword: lib/poi-2.1-20040508.jar lib/poi-scratchpad-2.1-20040508.jar plugin.xml
Author: jerome Date: Fri Feb 10 03:24:37 2006 New Revision: 376638 URL: http://svn.apache.org/viewcvs?rev=376638view=rev Log: Remove POI libs that are no longer used Removed: lucene/nutch/trunk/src/plugin/parse-msword/lib/poi-2.1-20040508.jar lucene/nutch/trunk/src/plugin/parse-msword/lib/poi-scratchpad-2.1-20040508.jar Modified: lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml Modified: lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml?rev=376638r1=376637r2=376638view=diff == --- lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml Fri Feb 10 03:24:37 2006 @@ -9,8 +9,6 @@ library name=parse-msword.jar export name=*/ /library - library name=poi-2.1-20040508.jar/ - library name=poi-scratchpad-2.1-20040508.jar/ /runtime requires
svn commit: r376315 - /lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
Author: jerome Date: Thu Feb 9 07:14:17 2006 New Revision: 376315 URL: http://svn.apache.org/viewcvs?rev=376315view=rev Log: Add a default constructor to avoid failure when JobConf tries to instantiate ParseSegment on parse command. (Just a workaround for HADOOP-29 issue). Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=376315r1=376314r2=376315view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Thu Feb 9 07:14:17 2006 @@ -35,7 +35,11 @@ public static final Logger LOG = LogFormatter.getLogger(Parser.class.getName()); - + + public ParseSegment() { +this(null); + } + public ParseSegment(Configuration conf) { super(conf); }
svn commit: r376478 - /lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
Author: jerome Date: Thu Feb 9 15:12:59 2006 New Revision: 376478 URL: http://svn.apache.org/viewcvs?rev=376478view=rev Log: Ensure ParseData content metadata contains segment name and score Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=376478r1=376477r2=376478view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Feb 9 15:12:59 2006 @@ -235,6 +235,9 @@ byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse); metadata.set(SIGNATURE_KEY, StringUtil.toHexString(signature)); datum.setSignature(signature); +// Ensure segment name and score are in parseData metadata +parse.getData().getContentMeta().set(SEGMENT_NAME_KEY, segmentName); +parse.getData().getContentMeta().set(SCORE_KEY, Float.toString(datum.getScore())); } try {
svn commit: r376522 - /lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
Author: jerome Date: Thu Feb 9 17:08:15 2006 New Revision: 376522 URL: http://svn.apache.org/viewcvs?rev=376522view=rev Log: Ensure signature added in content metadata Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=376522r1=376521r2=376522view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Feb 9 17:08:15 2006 @@ -237,7 +237,8 @@ datum.setSignature(signature); // Ensure segment name and score are in parseData metadata parse.getData().getContentMeta().set(SEGMENT_NAME_KEY, segmentName); -parse.getData().getContentMeta().set(SCORE_KEY, Float.toString(datum.getScore())); +parse.getData().getContentMeta().set(SCORE_KEY, Float.toString(datum.getScore())); +parse.getData().getContentMeta().set(SIGNATURE_KEY, StringUtil.toHexString(signature)); } try {
svn commit: r375965 - in /lucene/nutch/trunk: build.xml src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html
Author: jerome Date: Wed Feb 8 05:58:08 2006 New Revision: 375965 URL: http://svn.apache.org/viewcvs?rev=375965view=rev Log: Fix some javadoc issues with lib-http plugin Added: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html (with props) Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=375965r1=375964r2=375965view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Wed Feb 8 05:58:08 2006 @@ -223,6 +223,7 @@ bottom=Copyright amp;copy; ${year} The Apache Software Foundation packageset dir=${src.dir}/ + packageset dir=${plugins.dir}/lib-http/src/java/ packageset dir=${plugins.dir}/protocol-file/src/java/ packageset dir=${plugins.dir}/protocol-ftp/src/java/ packageset dir=${plugins.dir}/protocol-http/src/java/ Added: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html?rev=375965view=auto == --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html (added) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html Wed Feb 8 05:58:08 2006 @@ -0,0 +1,6 @@ +html +body +pCommon API used by HTTP plugins ([EMAIL PROTECTED] org.apache.nutch.protocol.http http}, [EMAIL PROTECTED] org.apache.nutch.protocol.httpclient httpclient})/p +/body +/html Propchange: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html -- svn:eol-style = native
svn commit: r375984 - in /lucene/nutch/trunk/src: java/org/apache/nutch/parse/ java/org/apache/nutch/util/ plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/
Author: jerome Date: Wed Feb 8 07:42:44 2006 New Revision: 375984 URL: http://svn.apache.org/viewcvs?rev=375984view=rev Log: Fix some javadoc errors and warnings Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=375984r1=375983r2=375984view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java Wed Feb 8 07:42:44 2006 @@ -64,9 +64,9 @@ /** * Reads the codeparse-plugins.xml/code file and returns the - * [EMAIL PROTECTED] ParsePluginPreferenceList} defined by it. + * [EMAIL PROTECTED] ParsePluginList} defined by it. * - * @return A [EMAIL PROTECTED] ParsePluginPreferenceList} specified by the + * @return A [EMAIL PROTECTED] ParsePluginList} specified by the * codeparse-plugins.xml/code file. * @throws Exception * If any parsing error occurs. 
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=375984r1=375983r2=375984view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Wed Feb 8 07:42:44 2006 @@ -51,7 +51,7 @@ } /** - * Performs a parse by iterating through a List of preferred [EMAIL PROTECTED] + * Performs a parse by iterating through a List of preferred [EMAIL PROTECTED] Parser}s * until a successful parse is performed and a [EMAIL PROTECTED] Parse} object is * returned. If the parse is unsuccessful, a message is logged to the * codeWARNING/code level, and an empty parse is returned. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=375984r1=375983r2=375984view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Wed Feb 8 07:42:44 2006 @@ -106,13 +106,13 @@ * * The function consults the internal [EMAIL PROTECTED] ParsePluginList} for the * ParserFactory to determine the list of pluginIds, then gets the - * appropriate extension points to instantiate as {Parser}s. + * appropriate extension points to instantiate as [EMAIL PROTECTED] Parser}s. * * @param contentType The contentType to return the codeArray/code - *of {Parser}s for. + *of [EMAIL PROTECTED] Parser}s for. * @param url The url for the content that may allow us to get the type from *the file suffix. - * @return An codeArray/code of [EMAIL PROTECTED] for the given contentType. + * @return An codeArray/code of [EMAIL PROTECTED] Parser}s for the given contentType. 
* If there were plugins mapped to a contentType via the * codeparse-plugins.xml/code file, but never enabled via * the codeplugin.includes/code Nutch conf, then those plugins Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java?rev=375984r1=375983r2=375984view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java Wed Feb 8 07:42:44 2006 @@ -57,7 +57,6 @@ * Convenience call for [EMAIL PROTECTED] #toHexString(byte[], String, int)}, where * codesep = null; lineLen = Integer.MAX_VALUE/code. * @param buf - * @return */ public static String toHexString(byte[] buf) { return toHexString(buf, null, Integer.MAX_VALUE); Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src
svn commit: r376012 - in /lucene/nutch/trunk: build.xml default.properties
Author: jerome Date: Wed Feb 8 10:03:01 2006 New Revision: 376012 URL: http://svn.apache.org/viewcvs?rev=376012view=rev Log: Add/Move some plugins javadoc to the Plugins group Modified: lucene/nutch/trunk/build.xml lucene/nutch/trunk/default.properties Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376012r1=376011r2=376012view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Wed Feb 8 10:03:01 2006 @@ -227,20 +227,28 @@ packageset dir=${src.dir}/ packageset dir=${plugins.dir}/lib-http/src/java/ + packageset dir=${plugins.dir}/ontology/src/java/ packageset dir=${plugins.dir}/protocol-file/src/java/ packageset dir=${plugins.dir}/protocol-ftp/src/java/ packageset dir=${plugins.dir}/protocol-http/src/java/ packageset dir=${plugins.dir}/protocol-httpclient/src/java/ + packageset dir=${plugins.dir}/parse-ext/src/java/ packageset dir=${plugins.dir}/parse-html/src/java/ packageset dir=${plugins.dir}/parse-js/src/java/ packageset dir=${plugins.dir}/parse-text/src/java/ packageset dir=${plugins.dir}/parse-pdf/src/java/ !-- packageset dir=${plugins.dir}/parse-rtf/src/java/ plugin excluded from build due to licensing issues-- !-- packageset dir=${plugins.dir}/parse-mp3/src/java/ plugin excluded from build due to licensing issues-- + packageset dir=${plugins.dir}/parse-mspowerpoint/src/java/ packageset dir=${plugins.dir}/parse-msword/src/java/ + packageset dir=${plugins.dir}/parse-rss/src/java/ + packageset dir=${plugins.dir}/parse-swf/src/java/ + packageset dir=${plugins.dir}/parse-zip/src/java/ packageset dir=${plugins.dir}/index-basic/src/java/ packageset dir=${plugins.dir}/index-more/src/java/ packageset dir=${plugins.dir}/query-more/src/java/ + packageset dir=${plugins.dir}/query-site/src/java/ + packageset dir=${plugins.dir}/query-url/src/java/ packageset dir=${plugins.dir}/urlfilter-regex/src/java/ packageset dir=${plugins.dir}/urlfilter-prefix/src/java/ packageset 
dir=${plugins.dir}/creativecommons/src/java/ Modified: lucene/nutch/trunk/default.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=376012r1=376011r2=376012view=diff == --- lucene/nutch/trunk/default.properties (original) +++ lucene/nutch/trunk/default.properties Wed Feb 8 10:03:01 2006 @@ -33,8 +33,6 @@ src.webapps = ./src/webapps # Proxy Host and Port to use for building JavaDoc -#javadoc.proxy.host=-J-DproxyHost= -#javadoc.proxy.port=-J-DproxyPort= javadoc.proxy.host=-J-DproxyHost= javadoc.proxy.port=-J-DproxyPort= javadoc.link.java=http://java.sun.com/j2se/1.4.2/docs/api/ @@ -49,21 +47,57 @@ javac.deprecation=off javac.version= 1.4 -plugin.http=org.apache.nutch.protocol.http* -plugin.httpclient=org.apache.nutch.protocol.httpclient* -plugin.ftp=org.apache.nutch.protocol.ftp* +# The list of packages assigned to plugins group in javadoc +# (please keep this list ordered) +plugin.basic=org.apache.nutch.indexer.basic* +plugin.carrot2=org.apache.nutch.clustering.carrot2* +plugin.creative=org.creativecommons.nutch* +plugin.ext=org.apache.nutch.parse.ext* plugin.file=org.apache.nutch.protocol.file* +plugin.ftp=org.apache.nutch.protocol.ftp* plugin.html=org.apache.nutch.parse.html* +plugin.http=org.apache.nutch.protocol.http* +plugin.httpclient=org.apache.nutch.protocol.httpclient* plugin.js=org.apache.nutch.parse.js* +plugin.language=org.apache.nutch.analysis.lang* +plugin.libhttp=org.apache.nutch.protocol.http.api* +plugin.more=org.apache.nutch.indexer.more*:org.apache.nutch.searcher.more* plugin.mp3=org.apache.nutch.parse.mp3* +plugin.mspowerpoint=org.apache.nutch.parse.mspowerpoint* plugin.msword=org.apache.nutch.parse.msword* -plugin.rtf=org.apache.nutch.parse.rtf* +# Unfortunately, ontology on core and plugin uses the same package: +# plugin.ontology=org.apache.nutch.ontology* plugin.pdf=org.apache.nutch.parse.pdf* +plugin.rss=org.apache.nutch.parse.rss* +plugin.rtf=org.apache.nutch.parse.rtf* 
+plugin.site=org.apache.nutch.searcher.site* +plugin.swf=org.apache.nutch.parse.swf* plugin.text=org.apache.nutch.parse.text* -plugin.basic=org.apache.nutch.indexer.basic* -plugin.more=org.apache.nutch.indexer.more* -plugin.language=org.apache.nutch.analysis.lang* -plugin.creative=org.creativecommons.nutch* -plugins.packages=${plugin.http}:${plugin.httpclient}:${plugin.ftp}:${plugin.file}:${plugin.html}:${plugin.js}:${plugin.mp3}:\ - ${plugin.msword}:${plugin.rtf}:${plugin.pdf}:${plugin.text}:${plugin.basic}:${plugin.more}:\ - ${plugin.language}:${plugin.creative} +plugin.url=org.apache.nutch.searcher.url* +plugin.zip
svn commit: r368172 - in /lucene/nutch/trunk: docs/ca/ docs/de/ docs/en/ docs/es/ docs/fi/ docs/fr/ docs/hu/ docs/jp/ docs/ms/ docs/nl/ docs/pl/ docs/pt/ docs/sv/ docs/th/ docs/zh/ src/java/org/apache
Author: jerome Date: Wed Jan 11 15:50:13 2006 New Revision: 368172 URL: http://svn.apache.org/viewcvs?rev=368172view=rev Log: Add a style for summary highlight and ellipsis Modified: lucene/nutch/trunk/docs/ca/about.html lucene/nutch/trunk/docs/ca/help.html lucene/nutch/trunk/docs/ca/search.html lucene/nutch/trunk/docs/de/about.html lucene/nutch/trunk/docs/de/help.html lucene/nutch/trunk/docs/de/search.html lucene/nutch/trunk/docs/en/about.html lucene/nutch/trunk/docs/en/help.html lucene/nutch/trunk/docs/en/search.html lucene/nutch/trunk/docs/es/about.html lucene/nutch/trunk/docs/es/help.html lucene/nutch/trunk/docs/es/search.html lucene/nutch/trunk/docs/fi/about.html lucene/nutch/trunk/docs/fi/help.html lucene/nutch/trunk/docs/fi/search.html lucene/nutch/trunk/docs/fr/about.html lucene/nutch/trunk/docs/fr/search.html lucene/nutch/trunk/docs/hu/about.html lucene/nutch/trunk/docs/hu/help.html lucene/nutch/trunk/docs/hu/search.html lucene/nutch/trunk/docs/jp/about.html lucene/nutch/trunk/docs/jp/help.html lucene/nutch/trunk/docs/jp/search.html lucene/nutch/trunk/docs/ms/about.html lucene/nutch/trunk/docs/ms/help.html lucene/nutch/trunk/docs/ms/search.html lucene/nutch/trunk/docs/nl/about.html lucene/nutch/trunk/docs/nl/help.html lucene/nutch/trunk/docs/nl/search.html lucene/nutch/trunk/docs/pl/about.html lucene/nutch/trunk/docs/pl/help.html lucene/nutch/trunk/docs/pl/search.html lucene/nutch/trunk/docs/pt/about.html lucene/nutch/trunk/docs/pt/help.html lucene/nutch/trunk/docs/pt/search.html lucene/nutch/trunk/docs/sv/about.html lucene/nutch/trunk/docs/sv/help.html lucene/nutch/trunk/docs/sv/search.html lucene/nutch/trunk/docs/th/about.html lucene/nutch/trunk/docs/th/help.html lucene/nutch/trunk/docs/th/search.html lucene/nutch/trunk/docs/zh/about.html lucene/nutch/trunk/docs/zh/help.html lucene/nutch/trunk/docs/zh/search.html lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java lucene/nutch/trunk/src/web/include/style.html Modified: 
lucene/nutch/trunk/docs/ca/about.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/about.html?rev=368172r1=368171r2=368172view=diff == --- lucene/nutch/trunk/docs/ca/about.html (original) +++ lucene/nutch/trunk/docs/ca/about.html Wed Jan 11 15:50:13 2006 @@ -16,6 +16,8 @@ h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: #00;} h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: #00;} .url {color: #996600;} +.highlight {font-weight: bold;} +.ellipsis {font-weight: bold;} /style link type=image/x-icon href=../img/favicon.ico rel=icon link type=image/x-icon href=../img/favicon.ico rel=shortcut icon Modified: lucene/nutch/trunk/docs/ca/help.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/help.html?rev=368172r1=368171r2=368172view=diff == --- lucene/nutch/trunk/docs/ca/help.html (original) +++ lucene/nutch/trunk/docs/ca/help.html Wed Jan 11 15:50:13 2006 @@ -16,6 +16,8 @@ h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: #00;} h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: #00;} .url {color: #996600;} +.highlight {font-weight: bold;} +.ellipsis {font-weight: bold;} /style link type=image/x-icon href=../img/favicon.ico rel=icon link type=image/x-icon href=../img/favicon.ico rel=shortcut icon Modified: lucene/nutch/trunk/docs/ca/search.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/search.html?rev=368172r1=368171r2=368172view=diff == --- lucene/nutch/trunk/docs/ca/search.html (original) +++ lucene/nutch/trunk/docs/ca/search.html Wed Jan 11 15:50:13 2006 @@ -16,6 +16,8 @@ h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: #00;} h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: #00;} .url {color: #996600;} +.highlight {font-weight: bold;} +.ellipsis {font-weight: bold;} /style link type=image/x-icon href=../img/favicon.ico rel=icon link type=image/x-icon href=../img/favicon.ico rel=shortcut icon 
Modified: lucene/nutch/trunk/docs/de/about.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/de/about.html?rev=368172r1=368171r2=368172view=diff == --- lucene/nutch/trunk/docs/de/about.html (original) +++ lucene/nutch/trunk/docs/de/about.html Wed Jan 11 15:50:13 2006 @@ -16,6 +16,8 @@ h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: #00;} h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: #00;} .url
svn commit: r367405 - /lucene/nutch/trunk/src/plugin/build.xml
Author: jerome Date: Mon Jan 9 13:45:25 2006 New Revision: 367405 URL: http://svn.apache.org/viewcvs?rev=367405&view=rev Log: Remove deployment of analysis plugins (under dev) Remove protocol-http unit tests (moved to lib-http) Modified: lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=367405&r1=367404&r2=367405&view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Mon Jan 9 13:45:25 2006 @@ -6,8 +6,6 @@ !-- Build deploy all the plugin jars.-- !-- == -- target name=deploy - ant dir=analysis-de target=deploy/ - ant dir=analysis-fr target=deploy/ ant dir=clustering-carrot2 target=deploy/ ant dir=creativecommons target=deploy/ ant dir=index-basic target=deploy/ @@ -47,8 +45,8 @@ target name=test ant dir=creativecommons target=test/ ant dir=languageidentifier target=test/ + ant dir=lib-http target=test/ ant dir=ontology target=test/ - ant dir=protocol-http target=test/ ant dir=parse-ext target=test/ ant dir=parse-html target=test/ !-- ant dir=parse-mp3 target=test/ -- @@ -71,6 +69,7 @@ ant dir=index-basic target=clean/ ant dir=index-more target=clean/ ant dir=languageidentifier target=clean/ +ant dir=lib-http target=clean/ ant dir=lib-jakarta-poi target=clean/ ant dir=lib-lucene-analyzers target=clean/ ant dir=nutch-extensionpoints target=clean/
svn commit: r357334 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/protocol/Content.java src/java/org/apache/nutch/protocol/ContentProperties.java
Author: jerome Date: Sat Dec 17 02:06:31 2005 New Revision: 357334 URL: http://svn.apache.org/viewcvs?rev=357334view=rev Log: NUTCH-3, ContentProperties can handle multivalued properties (S. Groschupf) Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=357334r1=357333r2=357334view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Sat Dec 17 02:06:31 2005 @@ -105,7 +105,7 @@ property namehttp.redirect.max/name - value3/value + value10/value descriptionThe maximum number of redirects the fetcher will follow when trying to fetch a page./description /property @@ -727,7 +727,8 @@ property nameparser.character.encoding.default/name - valuewindows-1252/value + !--valuewindows-1252/value-- + valueutf-8/value descriptionThe character encoding to fall back to when no other information is available/description /property Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=357334r1=357333r2=357334view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec 17 02:06:31 2005 @@ -77,11 +77,8 @@ contentType = UTF8.readString(in);// read contentType -int propertyCount = in.readInt(); // read metadata metadata = new ContentProperties(); -for (int i = 0; i propertyCount; i++) { - metadata.put(UTF8.readString(in), UTF8.readString(in)); -} +metadata.readFields(in);// read meta data } protected final void writeCompressed(DataOutput out) throws IOException { @@ -95,13 +92,7 @@ UTF8.writeString(out, contentType); // 
write contentType -out.writeInt(metadata.size());// write metadata -Iterator i = metadata.entrySet().iterator(); -while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - UTF8.writeString(out, (String)e.getKey()); - UTF8.writeString(out, (String)e.getValue()); -} +metadata.write(out); // write metadata } public static Content read(DataInput in) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java?rev=357334r1=357333r2=357334view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java Sat Dec 17 02:06:31 2005 @@ -16,15 +16,22 @@ package org.apache.nutch.protocol; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; import java.util.Enumeration; import java.util.Iterator; import java.util.Properties; import java.util.TreeMap; +import org.apache.nutch.io.UTF8; +import org.apache.nutch.io.Writable; + /** - * case insensitive properties + * writable case insensitive properties */ -public class ContentProperties extends TreeMap { +public class ContentProperties extends TreeMap implements Writable { /** * construct the TreeMap with a case insensitive comparator @@ -51,6 +58,36 @@ return (String) get(key); } +/* + * (non-Javadoc) + * + * @see java.util.Map#get(java.lang.Object) + */ +public Object get(Object arg0) { +Object object = super.get(arg0); +if (object != null object instanceof ArrayList) { +ArrayList list = (ArrayList) object; +return list.get(list.size() - 1); +} +return object; +} + +/** + * @param key + * @return the properties as a string array if there is no such property we + * retunr a array with 0 entries + */ +public String[] getProperties(String key) { +Object object = super.get(key); +if (object != null 
!(object instanceof ArrayList)) { +return new String[] { (String) object }; +} else if (object != null object instanceof ArrayList) { +ArrayList list = (ArrayList) object; +return (String[]) list.toArray(new String[list.size()]); +} +return new String[0
svn commit: r355809 - in /lucene/nutch/trunk/src: java/org/apache/nutch/protocol/ java/org/apache/nutch/util/mime/ plugin/protocol-file/src/java/org/apache/nutch/protocol/file/ plugin/protocol-ftp/src
Author: jerome Date: Sat Dec 10 15:47:18 2005 New Revision: 355809 URL: http://svn.apache.org/viewcvs?rev=355809view=rev Log: Content-Type resolution enhancements: * Resolution moved from protocol plugins to Content constructor * Best content-type guessing policy * Some unit tests added Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=355809r1=355808r2=355809view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec 10 15:47:18 2005 @@ -22,6 +22,10 @@ import org.apache.nutch.io.*; import org.apache.nutch.fs.*; import org.apache.nutch.util.*; +import org.apache.nutch.util.mime.MimeType; +import org.apache.nutch.util.mime.MimeTypes; +import org.apache.nutch.util.mime.MimeTypeException; + public final class Content extends VersionedWritable { @@ -29,6 +33,14 @@ private final static byte VERSION = 1; + /** A flag that tells if magic resolution must be performed */ + private final static boolean MAGIC = +NutchConf.get().getBoolean(mime.type.magic, true); + + /** Get the MimeTypes resolver instance. 
*/ + private final static MimeTypes MIME = +MimeTypes.get(NutchConf.get().get(mime.types.file)); + private String url; private String base; private byte[] content; @@ -38,18 +50,17 @@ public Content() {} public Content(String url, String base, byte[] content, String contentType, - Properties metadata){ + Properties metadata) { if (url == null) throw new IllegalArgumentException(null url); if (base == null) throw new IllegalArgumentException(null base); if (content == null) throw new IllegalArgumentException(null content); -if (contentType == null) throw new IllegalArgumentException(null type); if (metadata == null) throw new IllegalArgumentException(null metadata); this.url = url; this.base = base; this.content = content; -this.contentType = contentType; +this.contentType = getContentType(contentType, url, content); this.metadata = metadata; } @@ -185,4 +196,33 @@ nfs.close(); } } + + private String getContentType(String typeName, String url, byte[] data) { + +MimeType type = null; +try { +typeName = MimeType.clean(typeName); +type = typeName == null ? null : MIME.forName(typeName); +} catch (MimeTypeException mte) { +// Seems to be a malformed mime type name... +} + +if (typeName == null || type == null || !type.matches(url)) { + // If no mime-type header, or cannot find a corresponding registered + // mime-type, or the one found doesn't match the url pattern + // it shouldbe, then guess a mime-type from the url pattern + type = MIME.getMimeType(url); + typeName = type == null ? typeName : type.getName(); +} +if (typeName == null || type == null || +(MAGIC type.hasMagic() !type.matches(data))) { + // If no mime-type already found, or the one found doesn't match + // the magic bytes it should be, then, guess a mime-type from the + // document content (magic bytes) + type = MIME.getMimeType(data); + typeName = type == null ? 
typeName : type.getName(); +} +return typeName; + } + } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java?rev=355809r1=355808r2=355809view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java Sat Dec 10 15:47:18 2005 @@ -227,11 +227,21 @@ return minLength; } -boolean hasMagic() { +public boolean hasMagic() { return (magics.size() 0); } -boolean matches(byte[] data) { +public boolean
svn commit: r355828 - in /lucene/nutch/trunk/src: java/org/apache/nutch/fetcher/ java/org/apache/nutch/parse/ java/org/apache/nutch/protocol/ java/org/apache/nutch/servlet/ java/org/apache/nutch/tools
Author: jerome Date: Sat Dec 10 16:36:57 2005 New Revision: 355828 URL: http://svn.apache.org/viewcvs?rev=355828view=rev Log: NUTCH-135 : Content metadata are now case insensitive (thanks to S. Groschupf) Added: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java (with props) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java 
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java lucene/nutch/trunk/src/web/jsp/cached.jsp Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=355828r1=355827r2=355828view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Dec 10 16:36:57 2005 @@ -234,7 +234,7 @@ MD5Hash hash = null; String url = fle.getPage().getURL().toString(); if (content == null) { -content = new Content(url, url, new byte[0], , new Properties()); +content = new Content(url, url, new byte[0], , new ContentProperties()); hash = MD5Hash.digest(url); } else { hash = MD5Hash.digest(content.getContent()); @@ -263,7 +263,7 @@ + status.toString()); outputPage(new FetcherOutput(fle, hash, protocolStatus), content, new ParseText(), -new ParseData(status, , new Outlink[0], new Properties())); +new ParseData(status, , new Outlink[0], new ContentProperties())); } return status; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=355828r1=355827r2=355828view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Sat Dec 10 16:36:57 2005 @@ -21,6 +21,7 @@ import org.apache.nutch.io.*; import org.apache.nutch.fs.*; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.tools.UpdateDatabaseTool; @@ -34,12 +35,12 @@ private String title; private Outlink[] outlinks; - private Properties metadata; + private ContentProperties
svn commit: r354399 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java
Author: jerome Date: Tue Dec 6 02:51:06 2005 New Revision: 354399 URL: http://svn.apache.org/viewcvs?rev=354399view=rev Log: Merge from trunk 354397:354398 - Improvements in plugin circular dependencies detection Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=354399r1=354398r2=354399view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java Tue Dec 6 02:51:06 2005 @@ -116,13 +116,20 @@ } private void getPluginCheckedDependencies(PluginDescriptor plugin, - Map plugins, Map dependencies) + Map plugins, + Map dependencies, + Map branch) throws MissingDependencyException, CircularDependencyException { if (dependencies == null) { dependencies = new HashMap(); } + if (branch == null) { branch = new HashMap(); } + branch.put(plugin.getPluginId(), plugin); + // Get the plugin dependencies String[] ids = plugin.getDependencies(); + + // Otherwise, checks each dependency for (int i=0; iids.length; i++) { String id = ids[i]; PluginDescriptor dependency = (PluginDescriptor) plugins.get(id); @@ -131,15 +138,17 @@ Missing dependency + id + for plugin + plugin.getPluginId()); } -if (dependencies.containsKey(id) !id.equals(nutch-extensionpoints)) { +if (branch.containsKey(id)) { throw new CircularDependencyException( Circular dependency detected + id + for plugin + plugin.getPluginId()); } dependencies.put(id, dependency); getPluginCheckedDependencies((PluginDescriptor) plugins.get(id), - plugins, dependencies); + plugins, dependencies, branch); } + + branch.remove(plugin.getPluginId()); } private Map getPluginCheckedDependencies(PluginDescriptor plugin, @@ -147,7 +156,8 @@ throws 
MissingDependencyException, CircularDependencyException { Map dependencies = new HashMap(); - getPluginCheckedDependencies(plugin, plugins, dependencies); + Map branch = new HashMap(); + getPluginCheckedDependencies(plugin, plugins, dependencies, branch); return dependencies; }
svn commit: r354575 - /lucene/nutch/trunk/src/web/jsp/cached.jsp
Author: jerome Date: Tue Dec 6 13:42:25 2005 New Revision: 354575 URL: http://svn.apache.org/viewcvs?rev=354575&view=rev Log: NUTCH-112, link to cached content changed to relative (C. Mattmann) Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/cached.jsp?rev=354575&r1=354574&r2=354575&view=diff == --- lucene/nutch/trunk/src/web/jsp/cached.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/cached.jsp Tue Dec 6 13:42:25 2005 @@ -74,6 +74,6 @@ % } else { % The cached content has mime type %=contentType%, -click this a href=/servlet/cached?%=id%link/a to download it directly. +click this a href=./servlet/cached?%=id%link/a to download it directly. % } %
svn commit: r354582 - /lucene/nutch/branches/mapred/src/web/jsp/cached.jsp
Author: jerome Date: Tue Dec 6 14:00:12 2005 New Revision: 354582 URL: http://svn.apache.org/viewcvs?rev=354582&view=rev Log: NUTCH-112, merged from trunk Modified: lucene/nutch/branches/mapred/src/web/jsp/cached.jsp Modified: lucene/nutch/branches/mapred/src/web/jsp/cached.jsp URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/web/jsp/cached.jsp?rev=354582&r1=354581&r2=354582&view=diff == --- lucene/nutch/branches/mapred/src/web/jsp/cached.jsp (original) +++ lucene/nutch/branches/mapred/src/web/jsp/cached.jsp Tue Dec 6 14:00:12 2005 @@ -74,6 +74,6 @@ % } else { % The cached content has mime type %=contentType%, -click this a href=/servlet/cached?%=id%link/a to download it directly. +click this a href=./servlet/cached?%=id%link/a to download it directly. % } %
svn commit: r293370 - in /lucene/nutch/trunk/src: java/org/apache/nutch/parse/ParsePluginsReader.java test/org/apache/nutch/parse/TestParserFactory.java
Author: jerome Date: Mon Oct 3 08:56:43 2005 New Revision: 293370 URL: http://svn.apache.org/viewcvs?rev=293370view=rev Log: Change the way the parse-plugin.xml file is loaded (MalformedURLException reported by Earl Cahill) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=293370r1=293369r2=293370view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java Mon Oct 3 08:56:43 2005 @@ -54,14 +54,17 @@ public static final Logger LOG = LogFormatter.getLogger(ParsePluginsReader.class.getName()); + /** The property name of the parse-plugins location */ + private static final String PP_FILE_PROP = parse.plugin.file; + /* the parse-plugins file */ - private String fParsePluginsFile = parse-plugins.xml; + private String fParsePluginsFile = null; + /** * Constructs a new ParsePluginsReader */ - public ParsePluginsReader() { - } + public ParsePluginsReader() { } /** * Reads the codeparse-plugins.xml/code file and returns the @@ -82,33 +85,22 @@ Document document = null; InputSource inputSource = null; -//check to see if the Nutch conf property -//parse.plugin.file is defined -String parsePluginFileUrl = NutchConf.get().get(parse.plugin.file); - InputStream ppInputStream = null; - -if (parsePluginFileUrl != null) { -URL parsePluginUrl = null; - -try { -parsePluginUrl = new URL(parsePluginFileUrl); -ppInputStream = parsePluginUrl.openStream(); -} catch (MalformedURLException e) { -LOG.log(Level.SEVERE, -Unable to load parse plugins file from URL [ -+ parsePluginFileUrl + ], e); -return null; -} catch (IOException e) { -LOG.log(Level.SEVERE, -Unable to load 
parse plugins file from URL [ -+ parsePluginFileUrl + ], e); -return null; -} -} else { -ppInputStream = NutchConf.get().getConfResourceAsInputStream( -fParsePluginsFile); -} +if (fParsePluginsFile != null) { + URL parsePluginUrl = null; + try { +parsePluginUrl = new URL(fParsePluginsFile); +ppInputStream = parsePluginUrl.openStream(); + } catch (Exception e) { +LOG.log(Level.SEVERE, +Unable to load parse plugins file from URL + +[ + fParsePluginsFile + ], e); +return pList; + } +} else { + ppInputStream = NutchConf.get().getConfResourceAsInputStream( + NutchConf.get().get(PP_FILE_PROP)); +} inputSource = new InputSource(ppInputStream); Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=293370r1=293369r2=293370view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java Mon Oct 3 08:56:43 2005 @@ -37,30 +37,13 @@ */ public class TestParserFactory extends TestCase { - private NutchConf conf = null; public TestParserFactory(String name) { super(name); } - - private void initNutchConf(String testFile) { -// set the Nutch Conf property for parse.plugin.file.url -// to ${test.src.dir}/org/apache/nutch/parse/parse-plugin-test.xml -String testParsePluginFileUrl = null; -try{ - testParsePluginFileUrl = new File(System.getProperty(test.src.dir) - +/org/apache/nutch/parse/ + testFile).toURL().toString(); - NutchConf.get().set(parse.plugin.file.url,testParsePluginFileUrl); - this.conf = NutchConf.get(); -} -catch(MalformedURLException e){ - throw new RuntimeException(Unable to load parse-plugins.xml file from URL: +testParsePluginFileUrl); -} - } - - /** Inits the Test Case: loads the Nutch Conf instance. 
*/ + + /** Inits the Test Case with the test parse-plugin file */ protected void setUp() throws Exception { -if (conf == null) { - initNutchConf(parse-plugin-test.xml); -} +NutchConf.get().set(parse.plugin.file, +org/apache/nutch/parse/parse
svn commit: r292035 - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/parse/ src/test/org/apache/nutch/parse/
Author: jerome Date: Tue Sep 27 13:45:37 2005 New Revision: 292035 URL: http://svn.apache.org/viewcvs?rev=292035view=rev Log: NUTCH-88, First step proposal implementation (thanks to Chris Mattmann and Sébastien Le Callonnec) Added: lucene/nutch/trunk/conf/parse-plugins.dtd (with props) lucene/nutch/trunk/conf/parse-plugins.xml (with props) lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java (with props) lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (with props) lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java (with props) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Added: lucene/nutch/trunk/conf/parse-plugins.dtd URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=292035view=auto == --- lucene/nutch/trunk/conf/parse-plugins.dtd (added) +++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Sep 27 13:45:37 2005 @@ -0,0 +1,7 @@ +!ELEMENT parse-plugins (mimeType+) +!ELEMENT mimeType (plugin+) +!ATTLIST mimeType name CDATA #REQUIRED + +!ELEMENT plugin EMPTY +!ATTLIST plugin id CDATA #REQUIRED +!ATTLIST plugin order CDATA '' \ No newline at end of file Propchange: lucene/nutch/trunk/conf/parse-plugins.dtd -- svn:eol-style = native Added: lucene/nutch/trunk/conf/parse-plugins.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=292035view=auto == --- lucene/nutch/trunk/conf/parse-plugins.xml (added) +++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Sep 27 13:45:37 2005 @@ -0,0 +1,207 @@ +?xml version=1.0 encoding=UTF-8? +!-- + Copyright 2005 The Apache Software Foundation + + Licensed under the Apache License, Version 2.0 (the License); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an AS IS BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Author : mattmann + Description: This xml file represents a natural ordering for which parsing + plugin should get called for a particular mimeType. +-- + +parse-plugins + + !-- by default if the mimeType is set to *, or + can't be determined, use parse-text -- + mimeType name=* + plugin id=parse-text / + /mimeType + + mimeType name=application/java + plugin id=parse-text / + /mimeType + + mimeType name=application/msword + plugin id=parse-msword / + /mimeType + + mimeType name=application/pdf + plugin id=parse-pdf / + plugin id=parse-text / + /mimeType + + mimeType name=application/postscript + plugin id=parse-pdf / + /mimeType + + mimeType name=application/rss+xml + plugin id=parse-rss / + plugin id=parse-text / + /mimeType + + mimeType name=application/vnd.ms-excel + plugin id=parse-msexcel / + /mimeType + + mimeType name=application/vnd.ms-powerpoint + plugin id=parse-mspowerpoint / + /mimeType + + mimeType name=application/vnd.wap.wbxml + plugin id=parse-text / + /mimeType + + mimeType name=application/vnd.wap.wmlc + plugin id=parse-text / + /mimeType + + mimeType name=application/vnd.wap.wmlscriptc + plugin id=parse-text / + /mimeType + + mimeType name=application/xhtml+xml + plugin id=parse-text / + /mimeType + + mimeType name=application/x-bzip2 + !-- try and parse it with the zip parser -- + plugin id=parse-zip / + /mimeType + + mimeType name=application/x-csh + plugin id=parse-text / + /mimeType + + mimeType name=application/x-gzip + !-- try and parse it with the zip parser -- + plugin id=parse-zip / + /mimeType + + mimeType name=application/x-javascript 
+ plugin id=parse-js / + plugin id=parse-text / + /mimeType + + mimeType name=application/x-kword + !-- try and parse it with the word parser -- + plugin id=parse-msword / + /mimeType + + mimeType name=application/x-kspread + !-- try and parse
svn commit: r280549 - /lucene/nutch/trunk/src/plugin/build.xml
Author: jerome Date: Tue Sep 13 05:52:13 2005 New Revision: 280549 URL: http://svn.apache.org/viewcvs?rev=280549view=rev Log: Sorted alphabetically for easy maintenance Modified: lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=280549r1=280548r2=280549view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Tue Sep 13 05:52:13 2005 @@ -6,89 +6,89 @@ !-- Build deploy all the plugin jars.-- !-- == -- target name=deploy + ant dir=clustering-carrot2 target=deploy/ + ant dir=creativecommons target=deploy/ + ant dir=index-basic target=deploy/ + ant dir=index-more target=deploy/ + ant dir=languageidentifier target=deploy/ ant dir=lib-jakarta-poi target=deploy/ ant dir=nutch-extensionpoints target=deploy/ + ant dir=ontology target=deploy/ ant dir=protocol-file target=deploy/ ant dir=protocol-ftp target=deploy/ ant dir=protocol-http target=deploy/ ant dir=protocol-httpclient target=deploy/ + ant dir=parse-ext target=deploy/ ant dir=parse-html target=deploy/ ant dir=parse-js target=deploy/ - ant dir=parse-text target=deploy/ + !-- ant dir=parse-mp3 target=deploy/ -- + ant dir=parse-mspowerpoint target=deploy/ + ant dir=parse-msword target=deploy/ ant dir=parse-pdf target=deploy/ ant dir=parse-rss target=deploy/ - ant dir=parse-msword target=deploy/ - ant dir=parse-mspowerpoint target=deploy/ -!-- ant dir=parse-mp3 target=deploy/ -- -!-- ant dir=parse-rtf target=deploy/ -- - ant dir=parse-ext target=deploy/ + !-- ant dir=parse-rtf target=deploy/ -- + ant dir=parse-text target=deploy/ ant dir=parse-zip target=deploy/ - ant dir=index-basic target=deploy/ - ant dir=index-more target=deploy/ ant dir=query-basic target=deploy/ ant dir=query-more target=deploy/ ant dir=query-site target=deploy/ ant dir=query-url target=deploy/ - ant dir=urlfilter-regex target=deploy/ ant dir=urlfilter-prefix target=deploy/ - 
ant dir=creativecommons target=deploy/ - ant dir=languageidentifier target=deploy/ - ant dir=clustering-carrot2 target=deploy/ - ant dir=ontology target=deploy/ + ant dir=urlfilter-regex target=deploy/ /target !-- == -- !-- Test all of the plugins. -- !-- == -- target name=test + ant dir=creativecommons target=test/ + ant dir=languageidentifier target=test/ + ant dir=ontology target=test/ ant dir=protocol-http target=test/ + ant dir=parse-ext target=test/ ant dir=parse-html target=test/ + !-- ant dir=parse-mp3 target=test/ -- + ant dir=parse-mspowerpoint target=test/ + ant dir=parse-msword target=test/ ant dir=parse-pdf target=test/ ant dir=parse-rss target=test/ - ant dir=parse-msword target=test/ - ant dir=parse-mspowerpoint target=test/ - !-- ant dir=parse-mp3 target=test/ -- !-- ant dir=parse-rtf target=test/ -- - ant dir=parse-ext target=test/ ant dir=parse-zip target=test/ - ant dir=creativecommons target=test/ - ant dir=languageidentifier target=test/ - ant dir=ontology target=test/ /target !-- == -- !-- Clean all of the plugins. 
-- !-- == -- target name=clean +ant dir=clustering-carrot2 target=clean/ +ant dir=creativecommons target=clean/ +ant dir=index-basic target=clean/ +ant dir=index-more target=clean/ +ant dir=languageidentifier target=clean/ ant dir=lib-jakarta-poi target=clean/ ant dir=nutch-extensionpoints target=clean/ +ant dir=ontology target=clean/ ant dir=protocol-file target=clean/ ant dir=protocol-ftp target=clean/ ant dir=protocol-http target=clean/ ant dir=protocol-httpclient target=clean/ +ant dir=parse-ext target=clean/ ant dir=parse-html target=clean/ ant dir=parse-js target=clean/ -ant dir=parse-text target=clean/ +ant dir=parse-mp3 target=clean/ +ant dir=parse-mspowerpoint target=clean/ +ant dir=parse-msword target=clean/ ant dir=parse-pdf target=clean/ ant dir=parse-rss target=clean/ -ant dir=parse-msword target=clean/ -ant dir=parse-mspowerpoint target=clean/ -ant dir=parse-mp3 target=clean/ ant dir=parse-rtf target=clean/ -ant dir=parse-ext target=clean/ +ant dir=parse-text target=clean/ ant dir=parse-zip target=clean/ -ant dir=index-basic target=clean
svn commit: r280551 - in /lucene/nutch/trunk/src/plugin: build.xml lib-lucene-analyzers/ lib-lucene-analyzers/build.xml lib-lucene-analyzers/lib/ lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar lib-lucene-analyzers/plugin.xml
Author: jerome Date: Tue Sep 13 06:06:32 2005 New Revision: 280551 URL: http://svn.apache.org/viewcvs?rev=280551view=rev Log: Add a lib plugin for lucene analyzers Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml (with props) lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar (with props) lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml (with props) Modified: lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=280551r1=280550r2=280551view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Tue Sep 13 06:06:32 2005 @@ -12,6 +12,7 @@ ant dir=index-more target=deploy/ ant dir=languageidentifier target=deploy/ ant dir=lib-jakarta-poi target=deploy/ + ant dir=lib-lucene-analyzers target=deploy/ ant dir=nutch-extensionpoints target=deploy/ ant dir=ontology target=deploy/ ant dir=protocol-file target=deploy/ @@ -66,6 +67,7 @@ ant dir=index-more target=clean/ ant dir=languageidentifier target=clean/ ant dir=lib-jakarta-poi target=clean/ +ant dir=lib-lucene-analyzers target=clean/ ant dir=nutch-extensionpoints target=clean/ ant dir=ontology target=clean/ ant dir=protocol-file target=clean/ Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml?rev=280551view=auto == --- lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml (added) +++ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml Tue Sep 13 06:06:32 2005 @@ -0,0 +1,17 @@ +?xml version=1.0? + +project name=lib-lucene-analyzers default=jar + + import file=../build-plugin.xml/ + + !-- + ! Override the compile and jar targets, + ! 
since there is nothing to compile here. + ! -- + target name=compile depends=init +echo message=Compiling plugin: ${name}/ + /target + + target name=jar depends=compile/ + +/project Propchange: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml -- svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar?rev=280551view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml?rev=280551view=auto == --- lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml Tue Sep 13 06:06:32 2005 @@ -0,0 +1,21 @@ +?xml version=1.0 encoding=UTF-8? +!-- + ! Lucene Analyzers + ! (http://lucene.apache.org/java/docs/lucene-sandbox/) + ! + ! Dowload : http://www.apache.org/dyn/closer.cgi/jakarta/lucene/binaries/ + ! License : http://www.apache.org/licenses/LICENSE-2.0.txt + !-- +plugin + id=lib-lucene-analyzers + name=Lucene Analysers + version=1.9-rc1-dev + provider-name=org.apache.lucene + + runtime + library name=lucene-analyzers-1.9-rc1-dev.jar +export name=*/ + /library + /runtime + +/plugin Propchange: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml -- svn:eol-style = native
svn commit: r280556 - in /lucene/nutch/trunk/src/plugin: ./ analysis-de/ analysis-de/src/ analysis-de/src/java/ analysis-de/src/java/org/ analysis-de/src/java/org/apache/ analysis-de/src/java/org/apache/nutch/ analysis-de/src/java/org/apache/nutch/anal...
Author: jerome Date: Tue Sep 13 07:03:36 2005 New Revision: 280556 URL: http://svn.apache.org/viewcvs?rev=280556view=rev Log: French and German analyzers added Added: lucene/nutch/trunk/src/plugin/analysis-de/ lucene/nutch/trunk/src/plugin/analysis-de/build.xml (with props) lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml (with props) lucene/nutch/trunk/src/plugin/analysis-de/src/ lucene/nutch/trunk/src/plugin/analysis-de/src/java/ lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/ lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/ lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/ lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/ lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java (with props) lucene/nutch/trunk/src/plugin/analysis-fr/ lucene/nutch/trunk/src/plugin/analysis-fr/build.xml (with props) lucene/nutch/trunk/src/plugin/analysis-fr/plugin.xml (with props) lucene/nutch/trunk/src/plugin/analysis-fr/src/ lucene/nutch/trunk/src/plugin/analysis-fr/src/java/ lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/ lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/ lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/ lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/ lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java (with props) Modified: lucene/nutch/trunk/src/plugin/build.xml Added: lucene/nutch/trunk/src/plugin/analysis-de/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/build.xml?rev=280556view=auto == --- lucene/nutch/trunk/src/plugin/analysis-de/build.xml (added) +++ lucene/nutch/trunk/src/plugin/analysis-de/build.xml Tue Sep 13 07:03:36 2005 @@ 
-0,0 +1,13 @@ +?xml version=1.0? + +project name=analysis-de default=jar + + import file=../build-plugin.xml/ + + path id=plugin.deps +fileset dir=../lib-lucene-analyzers/lib + include name=*.jar / +/fileset + /path + +/project Propchange: lucene/nutch/trunk/src/plugin/analysis-de/build.xml -- svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml?rev=280556view=auto == --- lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml Tue Sep 13 07:03:36 2005 @@ -0,0 +1,29 @@ +?xml version=1.0 encoding=UTF-8? +plugin + id=analysis-de + name=German Analysis Plug-in + version=1.0.0 + provider-name=org.apache.nutch + + runtime + library name=analysis-de.jar + export name=*/ + /library + /runtime + + requires + import plugin=nutch-extensionpoints/ + import plugin=lib-lucene-analyzers/ + /requires + + extension id=org.apache.nutch.analysis.de + name=GermanAnalyzer + point=org.apache.nutch.analysis.NutchAnalyzer + + implementation id=org.apache.nutch.analysis.de.GermanAnalyzer + class=org.apache.nutch.analysis.de.GermanAnalyzer + lang=de/ + + /extension + +/plugin Propchange: lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml -- svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java?rev=280556view=auto == --- lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java (added) +++ lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java Tue Sep 13 07:03:36 2005 @@ -0,0 +1,48 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the License); + * you may 
not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY
svn commit: r280176 - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/plugin/
Author: jerome Date: Sun Sep 11 13:28:07 2005 New Revision: 280176 URL: http://svn.apache.org/viewcvs?rev=280176view=rev Log: Automatically loads active plugins dependencies (add a property, default is on) Added: lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java (with props) lucene/nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java (with props) Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=280176r1=280175r2=280176view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Sun Sep 11 13:28:07 2005 @@ -586,8 +586,17 @@ /property property + nameplugin.auto-activation/name + valuefalse/value + descriptionDefines if some plugins that are not activated regarding + the plugin.includes and plugin.excludes properties must be automaticaly + activated if they are needed by some actived plugins. + /description +/property + +property nameplugin.includes/name - valuenutch-extensionpoints|protocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value + valueprotocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value descriptionRegular expression naming plugin directory names to include. Any plugin not matching this expression is excluded. In any case you need at least include the nutch-extensionpoints plugin. 
By Added: lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java?rev=280176view=auto == --- lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java Sun Sep 11 13:28:07 2005 @@ -0,0 +1,35 @@ +/* +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the License); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + + +/** + * codeCircularDependencyException/code will be thrown if a circular + * dependency is detected. 
+ * + * @author J&eacute;r&ocirc;me Charron + */ +public class CircularDependencyException extends Exception { + + public CircularDependencyException(Throwable cause) { +super(cause); + } + + public CircularDependencyException(String message) { +super(message); + } +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java -- svn:eol-style = native Added: lucene/nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java?rev=280176&view=auto == --- lucene/nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java Sun Sep 11 13:28:07 2005 @@ -0,0 +1,35 @@ +/* +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the License); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + + +/** + * codeMissingDependencyException/code will be thrown if a plugin + * dependency cannot be found. + * + * @author J&eacute;r&ocirc;me Charron
svn commit: r280179 - in /lucene/nutch/trunk/src/plugin: clustering-carrot2/ creativecommons/ index-basic/ index-more/ languageidentifier/ ontology/ parse-ext/ parse-html/ parse-js/ parse-mp3/ parse-mspowerpoint/ parse-msword/ parse-pdf/ parse-rss/ par...
Author: jerome Date: Sun Sep 11 13:34:12 2005 New Revision: 280179 URL: http://svn.apache.org/viewcvs?rev=280179view=rev Log: Add a dependency to nutch-extensionpoints plugin Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml lucene/nutch/trunk/src/plugin/index-basic/plugin.xml lucene/nutch/trunk/src/plugin/index-more/plugin.xml lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml lucene/nutch/trunk/src/plugin/ontology/plugin.xml lucene/nutch/trunk/src/plugin/parse-ext/plugin.xml lucene/nutch/trunk/src/plugin/parse-html/plugin.xml lucene/nutch/trunk/src/plugin/parse-js/plugin.xml lucene/nutch/trunk/src/plugin/parse-mp3/plugin.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml lucene/nutch/trunk/src/plugin/parse-rtf/plugin.xml lucene/nutch/trunk/src/plugin/parse-text/plugin.xml lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml lucene/nutch/trunk/src/plugin/protocol-file/plugin.xml lucene/nutch/trunk/src/plugin/protocol-ftp/plugin.xml lucene/nutch/trunk/src/plugin/protocol-http/plugin.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml lucene/nutch/trunk/src/plugin/query-basic/plugin.xml lucene/nutch/trunk/src/plugin/query-more/plugin.xml lucene/nutch/trunk/src/plugin/query-site/plugin.xml lucene/nutch/trunk/src/plugin/query-url/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=280179r1=280178r2=280179view=diff == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Sun Sep 11 
13:34:12 2005 @@ -27,6 +27,10 @@ library name=nekohtml-0.9.2.jar/ /runtime + requires + import plugin=nutch-extensionpoints/ + /requires + extension id=org.apache.nutch.clustering.carrot2 name=Carrot2 Clusterer point=org.apache.nutch.clustering.OnlineClusterer Modified: lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml?rev=280179r1=280178r2=280179view=diff == --- lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml Sun Sep 11 13:34:12 2005 @@ -11,6 +11,10 @@ /library /runtime + requires + import plugin=nutch-extensionpoints/ + /requires + extension id=org.creativecommons.nutch.CCParseFilter name=Creative Commons Metadata Filter point=org.apache.nutch.parse.HtmlParseFilter Modified: lucene/nutch/trunk/src/plugin/index-basic/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/plugin.xml?rev=280179r1=280178r2=280179view=diff == --- lucene/nutch/trunk/src/plugin/index-basic/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/index-basic/plugin.xml Sun Sep 11 13:34:12 2005 @@ -12,6 +12,10 @@ /library /runtime + requires + import plugin=nutch-extensionpoints/ + /requires + extension id=org.apache.nutch.indexer.basic name=Nutch Basic Indexing Filter point=org.apache.nutch.indexer.IndexingFilter Modified: lucene/nutch/trunk/src/plugin/index-more/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/plugin.xml?rev=280179r1=280178r2=280179view=diff == --- lucene/nutch/trunk/src/plugin/index-more/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/index-more/plugin.xml Sun Sep 11 13:34:12 2005 @@ -12,6 +12,10 @@ /library /runtime + requires + import plugin=nutch-extensionpoints/ + /requires + extension id=org.apache.nutch.indexer.more name=Nutch More Indexing Filter point=org.apache.nutch.indexer.IndexingFilter Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml?rev=280179r1=280178r2=280179view=diff == --- lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml (original
svn commit: r279286 - /lucene/nutch/trunk/conf/nutch-default.xml
Author: jerome Date: Wed Sep 7 02:57:24 2005 New Revision: 279286 URL: http://svn.apache.org/viewcvs?rev=279286&view=rev Log: Includes protocol-httpclient plugin (instead of protocol-http) and parse-js erroneously removed during commit of revision 233492 Modified: lucene/nutch/trunk/conf/nutch-default.xml Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=279286&r1=279285&r2=279286&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Wed Sep 7 02:57:24 2005 @@ -587,7 +587,7 @@ property nameplugin.includes/name - valuenutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)/value + valuenutch-extensionpoints|protocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value descriptionRegular expression naming plugin directory names to include. Any plugin not matching this expression is excluded. In any case you need at least include the nutch-extensionpoints plugin. By
svn commit: r279027 - /lucene/nutch/trunk/src/plugin/build.xml
Author: jerome Date: Tue Sep 6 09:02:26 2005 New Revision: 279027 URL: http://svn.apache.org/viewcvs?rev=279027&view=rev Log: lib-jakarta-poi added to the list of plugins to deploy/clean Modified: lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=279027&r1=279026&r2=279027&view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Tue Sep 6 09:02:26 2005 @@ -6,6 +6,7 @@ !-- Build deploy all the plugin jars.-- !-- == -- target name=deploy + ant dir=lib-jakarta-poi target=deploy/ ant dir=nutch-extensionpoints target=deploy/ ant dir=protocol-file target=deploy/ ant dir=protocol-ftp target=deploy/ @@ -59,6 +60,7 @@ !-- Clean all of the plugins. -- !-- == -- target name=clean +ant dir=lib-jakarta-poi target=clean/ ant dir=nutch-extensionpoints target=clean/ ant dir=protocol-file target=clean/ ant dir=protocol-ftp target=clean/
svn commit: r278626 - in /lucene/nutch/trunk/src/plugin: ./ parse-zip/ parse-zip/sample/ parse-zip/src/ parse-zip/src/java/ parse-zip/src/java/org/ parse-zip/src/java/org/apache/ parse-zip/src/java/org/apache/nutch/ parse-zip/src/java/org/apache/nutch/...
Author: jerome Date: Sun Sep 4 13:53:49 2005 New Revision: 278626 URL: http://svn.apache.org/viewcvs?rev=278626view=rev Log: NUTCH-53, Parser plugin for Zip files (Rohit Kulkarni) Added: lucene/nutch/trunk/src/plugin/parse-zip/ lucene/nutch/trunk/src/plugin/parse-zip/build.xml (with props) lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml (with props) lucene/nutch/trunk/src/plugin/parse-zip/sample/ lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip (with props) lucene/nutch/trunk/src/plugin/parse-zip/src/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (with props) lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (with props) lucene/nutch/trunk/src/plugin/parse-zip/src/test/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (with props) Modified: lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=278626r1=278625r2=278626view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Sun Sep 4 13:53:49 2005 @@ -21,6 +21,7 @@ !-- ant dir=parse-mp3 target=deploy/ 
-- !-- ant dir=parse-rtf target=deploy/ -- ant dir=parse-ext target=deploy/ + ant dir=parse-zip target=deploy/ ant dir=index-basic target=deploy/ ant dir=index-more target=deploy/ ant dir=query-basic target=deploy/ @@ -48,6 +49,7 @@ !-- ant dir=parse-mp3 target=test/ -- !-- ant dir=parse-rtf target=test/ -- ant dir=parse-ext target=test/ + ant dir=parse-zip target=test/ ant dir=creativecommons target=test/ ant dir=languageidentifier target=test/ ant dir=ontology target=test/ @@ -72,6 +74,7 @@ ant dir=parse-mp3 target=clean/ ant dir=parse-rtf target=clean/ ant dir=parse-ext target=clean/ +ant dir=parse-zip target=clean/ ant dir=index-basic target=clean/ ant dir=index-more target=clean/ ant dir=query-basic target=clean/ Added: lucene/nutch/trunk/src/plugin/parse-zip/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/build.xml?rev=278626view=auto == --- lucene/nutch/trunk/src/plugin/parse-zip/build.xml (added) +++ lucene/nutch/trunk/src/plugin/parse-zip/build.xml Sun Sep 4 13:53:49 2005 @@ -0,0 +1,15 @@ +?xml version=1.0? + +project name=parse-zip default=jar + + import file=../build-plugin.xml/ + + !-- for junit test -- + mkdir dir=${build.test}/data / + copy todir=${build.test}/data + fileset dir=sample + include name=*.zip / + /fileset + /copy + +/project Propchange: lucene/nutch/trunk/src/plugin/parse-zip/build.xml -- svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml?rev=278626view=auto == --- lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml Sun Sep 4 13:53:49 2005 @@ -0,0 +1,24 @@ +?xml version=1.0 encoding=UTF-8? 
+plugin + id=parse-zip + name=Zip Parse Plug-in + version=1.0.0 + provider-name=nutch.org + + runtime + library name=parse-zip.jar + export name=*/ + /library + /runtime + + extension id=org.apache.nutch.parse.zip + name=ZipParser + point=org.apache.nutch.parse.Parser + + implementation id=org.apache.nutch.parse.zip.ZipParser + class=org.apache.nutch.parse.zip.ZipParser + contentType=application/zip + pathSuffix=zip/ + /extension + +/plugin Propchange: lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml
svn commit: r265794 - in /lucene/nutch/trunk: lib/commons-lang-2.1.jar src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Author: jerome Date: Thu Sep 1 15:20:51 2005 New Revision: 265794 URL: http://svn.apache.org/viewcvs?rev=265794view=rev Log: NUTCH-65, Handles more modification-date format Added: lucene/nutch/trunk/lib/commons-lang-2.1.jar (with props) Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Added: lucene/nutch/trunk/lib/commons-lang-2.1.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/commons-lang-2.1.jar?rev=265794view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/commons-lang-2.1.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=265794r1=265793r2=265794view=diff == --- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu Sep 1 15:20:51 2005 @@ -53,7 +53,7 @@ import java.util.Enumeration; import java.util.Properties; - +import org.apache.commons.lang.time.DateUtils; /** * Add (or reset) a few metaData properties as respective fields * (if they are available), so that they can be displayed by more.jsp @@ -133,21 +133,37 @@ try { time = HttpDateFormat.toLong(date); } catch (ParseException e) { - // try to parse it as date in alternative format - String date2 = date; - try { -if (date.length() 25 ) date2 = date.substring(0, 25); -DateFormat df = new SimpleDateFormat(EEE, dd MMM HH:mm:ss, Locale.US); -time = df.parse(date2).getTime(); - } catch (Exception e1) { -try { - if (date.length() 24 ) date2 = date.substring(0, 24); - DateFormat df = new SimpleDateFormat(EEE MMM dd HH:mm:ss , Locale.US); - time = df.parse(date2).getTime(); -} catch 
(Exception e2) { - LOG.warning(url + : can't parse erroneous date: + date); -} - } + // try to parse it as date in alternative format + try { + Date parsedDate = DateUtils.parseDate(date, + new String [] { + EEE MMM dd HH:mm:ss , + EEE MMM dd HH:mm:ss zzz, + EEE, MMM dd HH:mm:ss zzz, + EEE, dd MMM HH:mm:ss zzz, + EEE,dd MMM HH:mm:ss zzz, + EEE, dd MMM HH:mm:sszzz, + EEE, dd MMM HH:mm:ss, + EEE, dd-MMM-yy HH:mm:ss zzz, + /MM/dd HH:mm:ss.SSS zzz, + /MM/dd HH:mm:ss.SSS, + /MM/dd HH:mm:ss zzz, + /MM/dd, + .MM.dd HH:mm:ss, + -MM-dd HH:mm, + MMM dd HH:mm:ss. zzz, + MMM dd HH:mm:ss zzz, + dd.MM. HH:mm:ss zzz, + dd MM HH:mm:ss zzz, + dd.MM.; HH:mm:ss, + dd.MM. HH:mm:ss, + dd.MM. zzz + }); + time = parsedDate.getTime(); + // LOG.warning(url + : parsed date: + date + to:+time); + } catch (Exception e2) { + LOG.warning(url + : can't parse erroneous date: + date); + } } return time; }
svn commit: r264964 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
Author: jerome Date: Wed Aug 31 01:04:52 2005 New Revision: 264964 URL: http://svn.apache.org/viewcvs?rev=264964view=rev Log: No more NullPointerException while logging the doc language if none Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=264964r1=264963r2=264964view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java Wed Aug 31 01:04:52 2005 @@ -145,8 +145,9 @@ // add the document to the index NutchAnalyzer analyzer = AnalyzerFactory.get(doc.get(lang)); - LOG.info( Indexing [ + doc.getField(url).stringValue() + - ] with analyzer + analyzer + ( + doc.getField(lang).stringValue() + )); + LOG.info( Indexing [ + doc.getField(url).stringValue() + ] + +with analyzer + analyzer + +( + doc.get(lang) + )); //LOG.info( Doc is + doc); writer.addDocument(doc, analyzer); if (count 0 count % LOG_STEP == 0) {
svn commit: r265020 - /lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
Author: jerome Date: Wed Aug 31 04:38:28 2005 New Revision: 265020 URL: http://svn.apache.org/viewcvs?rev=265020&view=rev Log: Fixes some typo (analySer = analyZer) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=265020&r1=265019&r2=265020&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java Wed Aug 31 04:38:28 2005 @@ -44,7 +44,7 @@ private final static Map CACHE = new HashMap(); - private final static NutchAnalyzer DEFAULT_ANALYSER = + private final static NutchAnalyzer DEFAULT_ANALYZER = new NutchDocumentAnalyzer(); @@ -60,22 +60,22 @@ /** - * Returns the appropriate {@link Analyser} implementation given a language - * code. + * Returns the appropriate {@link NutchAnalyzer analyzer} implementation + * given a language code. * - * <p>NutchAnalyser extensions should define the attribute lang. The first + * <p>NutchAnalyzer extensions should define the attribute lang. The first * plugin found whose lang attribute equals the specified lang parameter is * used. If none match, then the {@link NutchDocumentAnalyzer} is used. */ public static NutchAnalyzer get(String lang) { -NutchAnalyzer analyzer = DEFAULT_ANALYSER; +NutchAnalyzer analyzer = DEFAULT_ANALYZER; Extension extension = getExtension(lang); if (extension != null) { try { analyzer = (NutchAnalyzer) extension.getExtensionInstance(); } catch (PluginRuntimeException pre) { -analyzer = DEFAULT_ANALYSER; +analyzer = DEFAULT_ANALYZER; } } return analyzer;
svn commit: r265503 - in /lucene/nutch/trunk/src: java/org/apache/nutch/clustering/ java/org/apache/nutch/fs/ java/org/apache/nutch/mapReduce/ java/org/apache/nutch/parse/ java/org/apache/nutch/protoc
Author: jerome Date: Wed Aug 31 08:17:11 2005 New Revision: 265503 URL: http://svn.apache.org/viewcvs?rev=265503view=rev Log: Merged 0.7 branch changes 240321:240453 into trunk Modified: lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/MapOutputFile.java lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/RecordReader.java lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/package.html lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolException.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceGone.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceMoved.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/RetryLater.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java lucene/nutch/trunk/src/java/org/apache/nutch/util/Daemon.java lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java?rev=265503r1=265502r2=265503view=diff == --- 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java Wed Aug 31 08:17:11 2005 @@ -23,8 +23,8 @@ * algorithms. * * pBy the term bonline/b search results clustering we will understand - * a clusterer that works on a set of [EMAIL PROTECTED] Hit}s retrieved for a user's query - * and produces a set of [EMAIL PROTECTED] Clusters} that can be displayed to help + * a clusterer that works on a set of [EMAIL PROTECTED] HitDetails} retrieved for a user's + * query and produces a set of [EMAIL PROTECTED] HitsCluster} that can be displayed to help * the user gain insight in the topics found in the result./p * * pOther clustering options include predefined categories and off-line Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=265503r1=265502r2=265503view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java Wed Aug 31 08:17:11 2005 @@ -80,8 +80,8 @@ return getNamed(NutchConf.get().get(fs.default.name, local)); } -/** Returns a name for this filesystem, suitable to pass to [EMAIL PROTECTED] - * NutchFileSystem#getNamed(String).*/ +/** Returns a name for this filesystem, suitable to pass to + * [EMAIL PROTECTED] NutchFileSystem#getNamed(String)}.*/ public abstract String getName(); /** Returns a named filesystem. 
Names are either the string local or a Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java?rev=265503&r1=265502&r2=265503&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java Wed Aug 31 08:17:11 2005 @@ -25,9 +25,12 @@ import org.apache.nutch.io.UTF8; import org.apache.nutch.fs.NutchFileSystem; -/** A section of an input file. Returned by {@link - * InputFormat#getSplits(File[], int)} and passed to - * InputFormat#getRecordReader(FileSplit). */ +/** + * A section of an input file. + * Returned by {@link InputFormat#getSplits(NutchFileSystem, JobConf, int)} + * and passed to + * {@link InputFormat#getRecordReader(NutchFileSystem, FileSplit, JobConf)}. + */ public class FileSplit implements Writable { private File file; private long start; Modified: lucene/nutch/trunk/src/java/org/apache/nutch
svn commit: r240254 - in /lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang: HTMLLanguageParser.java LanguageIdentifier.java LanguageIndexingFilter.java LanguageQueryFilter.java NGramProfile.java
Author: jerome Date: Fri Aug 26 07:54:16 2005 New Revision: 240254 URL: http://svn.apache.org/viewcvs?rev=240254view=rev Log: Javadoc updates, corrections on input stream reading Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=240254r1=240253r2=240254view=diff == --- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original) +++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Fri Aug 26 07:54:16 2005 @@ -23,20 +23,37 @@ import java.util.logging.Logger; import org.apache.nutch.util.LogFormatter; -/** Adds metadata identifying language of document if found - * We could also run statistical analysis here but we'd miss all other formats +/** + * An [EMAIL PROTECTED] org.apache.nutch.parse.HtmlParseFilter} that looks for possible + * indications of content language. + * + * If some indication is found, it is added in the [EMAIL PROTECTED] #META_LANG_NAME} + * attribute of the [EMAIL PROTECTED] org.apache.nutch.parse.ParseData} metadata. 
+ * + * @author Sami Siren + * @author Jerome Charron */ public class HTMLLanguageParser implements HtmlParseFilter { + + /** The language meta data attribute name */ public static final String META_LANG_NAME=X-meta-lang; - public static final Logger LOG = LogFormatter + + private static final Logger LOG = LogFormatter .getLogger(HTMLLanguageParser.class.getName()); /** - * Scan the HTML document looking at possible indications of content languagebr - * li1. html lang attribute (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) - * li2. meta dc.language (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language) - * li3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) - * brOnly the first occurence of language is stored. + * Scan the HTML document looking at possible indications of content language. + * ol + * lihtml lang attribute + * (a href=http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1; + * http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1/a),/li + * limeta dc.language (a href=http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language; + * http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language/a),/li + * limeta http-equiv (content-language) ( + * a href=http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2; + * http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2/a)./li + * /ol + * Only the first occurence of language is stored. 
*/ public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { String lang = findLanguage(doc); Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=240254r1=240253r2=240254view=diff == --- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original) +++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Fri Aug 26 07:54:16 2005 @@ -20,6 +20,7 @@ import java.io.InputStream; import java.io.IOException; import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; import java.io.FileInputStream; import java.io.InputStreamReader; import java.util.List; @@ -48,6 +49,10 @@ /** + * Identify the language of a content, based on statistical analysis. + * + * @see a href=http://www.w3.org/WAI/ER/IG/ert/iso639.htm;ISO 639 + * Language Codes
svn commit: r240345 - in /lucene/nutch/branches/Release-0.7/src: java/org/apache/nutch/clustering/ java/org/apache/nutch/fs/ java/org/apache/nutch/mapReduce/ java/org/apache/nutch/parse/ java/org/apache/nutch/protocol/ java/org/apache/nutch/searcher/ j...
Author: jerome Date: Fri Aug 26 14:03:19 2005 New Revision: 240345 URL: http://svn.apache.org/viewcvs?rev=240345view=rev Log: NUTCH-37, no more javadoc warnings Modified: lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/clustering/OnlineClusterer.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/fs/NutchFileSystem.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/FileSplit.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/MapOutputFile.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/RecordReader.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/package.html lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/parse/Parse.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/protocol/ProtocolException.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/protocol/ResourceGone.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/protocol/ResourceMoved.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/protocol/RetryLater.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/searcher/Hits.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/segment/SegmentReader.java lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/util/Daemon.java lucene/nutch/branches/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java lucene/nutch/branches/Release-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java lucene/nutch/branches/Release-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java lucene/nutch/branches/Release-0.7/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java Modified: 
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/clustering/OnlineClusterer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/clustering/OnlineClusterer.java?rev=240345r1=240344r2=240345view=diff == --- lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/clustering/OnlineClusterer.java (original) +++ lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/clustering/OnlineClusterer.java Fri Aug 26 14:03:19 2005 @@ -23,8 +23,8 @@ * algorithms. * * pBy the term bonline/b search results clustering we will understand - * a clusterer that works on a set of [EMAIL PROTECTED] Hit}s retrieved for a user's query - * and produces a set of [EMAIL PROTECTED] Clusters} that can be displayed to help + * a clusterer that works on a set of [EMAIL PROTECTED] HitDetails} retrieved for a user's + * query and produces a set of [EMAIL PROTECTED] HitsCluster} that can be displayed to help * the user gain insight in the topics found in the result./p * * pOther clustering options include predefined categories and off-line Modified: lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/fs/NutchFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=240345r1=240344r2=240345view=diff == --- lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/fs/NutchFileSystem.java (original) +++ lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/fs/NutchFileSystem.java Fri Aug 26 14:03:19 2005 @@ -80,8 +80,8 @@ return getNamed(NutchConf.get().get(fs.default.name, local)); } -/** Returns a name for this filesystem, suitable to pass to [EMAIL PROTECTED] - * NutchFileSystem#getNamed(String).*/ +/** Returns a name for this filesystem, suitable to pass to + * [EMAIL PROTECTED] NutchFileSystem#getNamed(String)}.*/ public abstract String getName(); /** Returns a named filesystem. 
Names are either the string local or a Modified: lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/FileSplit.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/FileSplit.java?rev=240345r1=240344r2=240345view=diff == --- lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/FileSplit.java (original) +++ lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/FileSplit.java Fri Aug 26 14:03:19 2005 @@ -25,9 +25,12 @@ import org.apache.nutch.io.UTF8; import org.apache.nutch.fs.NutchFileSystem; -/** A section of an input file. Returned by [EMAIL PROTECTED] - * InputFormat
svn commit: r240359 - in /lucene/nutch/trunk/src: java/org/apache/nutch/analysis/ java/org/apache/nutch/indexer/ plugin/nutch-extensionpoints/
Author: jerome Date: Fri Aug 26 15:47:04 2005 New Revision: 240359 URL: http://svn.apache.org/viewcvs?rev=240359view=rev Log: Add an analysis extension point Added: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java (with props) lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java (with props) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Added: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=240359view=auto == --- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java Fri Aug 26 15:47:04 2005 @@ -0,0 +1,107 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the License); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.analysis; + +// JDK imports +import java.util.HashMap; +import java.util.Map; +import java.util.logging.Logger; + +// Nutch imports +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.ExtensionPoint; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.plugin.PluginRuntimeException; +import org.apache.nutch.util.LogFormatter; + + +/** + * Creates and caches [EMAIL PROTECTED] NutchAnalyzer} plugins. + * + * @author Jeacute;rocirc;me Charron + */ +public class AnalyzerFactory { + + public final static Logger LOG = + LogFormatter.getLogger(AnalyzerFactory.class.getName()); + + private final static ExtensionPoint X_POINT = + PluginRepository.getInstance() + .getExtensionPoint(NutchAnalyzer.X_POINT_ID); + + private final static Map CACHE = new HashMap(); + + private final static NutchAnalyzer DEFAULT_ANALYSER = +new NutchDocumentAnalyzer(); + + + static { +if (X_POINT == null) { + throw new RuntimeException(x point + NutchAnalyzer.X_POINT_ID + + not found.); +} + } + + + private AnalyzerFactory() {} + + + /** + * Returns the appropriate [EMAIL PROTECTED] Analyser} implementation given a language + * code. + * + * pNutchAnalyser extensions should define the attribute lang. The first + * plugin found whose lang attribute equals the specified lang parameter is + * used. If none match, then the [EMAIL PROTECTED] NutchDocumentAnalyzer} is used. 
+ */ + public static NutchAnalyzer get(String lang) { + +NutchAnalyzer analyzer = DEFAULT_ANALYSER; +Extension extension = getExtension(lang); +if (extension != null) { +try { +analyzer = (NutchAnalyzer) extension.getExtensionInstance(); +} catch (PluginRuntimeException pre) { +analyzer = DEFAULT_ANALYSER; +} +} +return analyzer; + } + + private static Extension getExtension(String lang) { + +Extension extension = (Extension) CACHE.get(lang); +if (extension == null) { + extension = findExtension(lang); + CACHE.put(lang, extension); +} +return extension; + } + + private static Extension findExtension(String lang) { + +if (lang != null) { + Extension[] extensions = X_POINT.getExtentens(); + for (int i=0; iextensions.length; i++) { +if (lang.equals(extensions[i].getAttribute(lang))) { + return extensions[i]; +} + } +} +return null; + } + +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java -- svn:eol-style = native Added: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java?rev=240359view=auto == --- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java Fri Aug 26 15:47:04 2005
svn commit: r233492 - in /lucene/nutch/trunk: conf/ src/plugin/ src/plugin/clustering-carrot2/ src/plugin/creativecommons/ src/plugin/index-basic/ src/plugin/index-more/ src/plugin/languageidentifier/ src/plugin/nutch-extensionpoints/ src/plugin/nutch-...
Author: jerome Date: Fri Aug 19 08:55:46 2005 New Revision: 233492 URL: http://svn.apache.org/viewcvs?rev=233492view=rev Log: NUTCH-10, extension points defined only once (Stefan Grroschupf) Added: lucene/nutch/trunk/src/plugin/nutch-extensionpoints/ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/build.xml (with props) lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml (with props) lucene/nutch/trunk/src/plugin/nutch-extensionpoints/src/ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/src/java/ Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml lucene/nutch/trunk/src/plugin/index-basic/plugin.xml lucene/nutch/trunk/src/plugin/index-more/plugin.xml lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml lucene/nutch/trunk/src/plugin/ontology/plugin.xml lucene/nutch/trunk/src/plugin/parse-ext/plugin.xml lucene/nutch/trunk/src/plugin/parse-html/plugin.xml lucene/nutch/trunk/src/plugin/parse-js/plugin.xml lucene/nutch/trunk/src/plugin/parse-mp3/plugin.xml lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml lucene/nutch/trunk/src/plugin/parse-rtf/plugin.xml lucene/nutch/trunk/src/plugin/parse-text/plugin.xml lucene/nutch/trunk/src/plugin/protocol-file/plugin.xml lucene/nutch/trunk/src/plugin/protocol-ftp/plugin.xml lucene/nutch/trunk/src/plugin/protocol-http/plugin.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml lucene/nutch/trunk/src/plugin/query-basic/plugin.xml lucene/nutch/trunk/src/plugin/query-more/plugin.xml lucene/nutch/trunk/src/plugin/query-site/plugin.xml lucene/nutch/trunk/src/plugin/query-url/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml Modified: 
lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=233492r1=233491r2=233492view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Fri Aug 19 08:55:46 2005 @@ -587,9 +587,10 @@ property nameplugin.includes/name - valueprotocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value + valuenutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)/value descriptionRegular expression naming plugin directory names to - include. Any plugin not matching this expression is excluded. By + include. Any plugin not matching this expression is excluded. + In any case you need at least include the nutch-extensionpoints plugin. By default Nutch includes crawling just HTML and plain text via HTTP, and basic indexing and search plugins. /description Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=233492r1=233491r2=233492view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Fri Aug 19 08:55:46 2005 @@ -6,6 +6,7 @@ !-- Build deploy all the plugin jars.-- !-- == -- target name=deploy + ant dir=nutch-extensionpoints target=deploy/ ant dir=protocol-file target=deploy/ ant dir=protocol-ftp target=deploy/ ant dir=protocol-http target=deploy/ @@ -54,6 +55,7 @@ !-- Clean all of the plugins. 
-- !-- == -- target name=clean +ant dir=nutch-extensionpoints target=clean/ ant dir=protocol-file target=clean/ ant dir=protocol-ftp target=clean/ ant dir=protocol-http target=clean/ Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=233492r1=233491r2=233492view=diff == --- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Fri Aug 19 08:55:46 2005 @@ -5,10 +5,6 @@ version=0.9.0 provider-name=carrot2.sourceforge.net - extension-point - id=org.apache.nutch.clustering.OnlineClusterer - name=Nutch Online Search Results Clustering Plugin/ - runtime library name=clustering-carrot2.jar export name=*/ Modified: lucene/nutch
svn commit: r233544 - /lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
Author: jerome Date: Fri Aug 19 12:26:14 2005 New Revision: 233544 URL: http://svn.apache.org/viewcvs?rev=233544&view=rev Log: Correction in LanguageIdentifier unit test Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java?rev=233544&r1=233543&r2=233544&view=diff == --- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java Fri Aug 19 12:26:14 2005 @@ -223,7 +223,7 @@ String testLine = null; while((testLine = testFile.readLine()) != null) { testLine = testLine.trim(); -if (testLine.length() > 64) { +if (testLine.length() > 256) { lang = idfr.identify(testLine); assertEquals(tokens[1], lang); }
svn commit: r233559 - in /lucene/nutch/trunk/src: java/org/apache/nutch/parse/ plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/ plugi...
Author: jerome Date: Fri Aug 19 14:15:02 2005 New Revision: 233559 URL: http://svn.apache.org/viewcvs?rev=233559view=rev Log: * Add utility to extract urls from plain text (Stephan Strittmatter) * Uses the OutlinkExtractor in parse plugins PDF, MSWord, Text, RTF, Ext Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (with props) lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java (with props) Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=233559view=auto == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Fri Aug 19 14:15:02 2005 @@ -0,0 +1,227 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the License); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse; + +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import org.apache.nutch.util.LogFormatter; +import org.apache.oro.text.regex.MatchResult; +import org.apache.oro.text.regex.Pattern; +import org.apache.oro.text.regex.PatternCompiler; +import org.apache.oro.text.regex.PatternMatcher; +import org.apache.oro.text.regex.PatternMatcherInput; +import org.apache.oro.text.regex.Perl5Compiler; +import org.apache.oro.text.regex.Perl5Matcher; + +/** + * Extractor to extract [EMAIL PROTECTED] org.apache.nutch.parse.Outlink}s + * / URLs from plain text using Regular Expressions. + * + * @see a + * href=http://wiki.java.net/bin/view/Javapedia/RegularExpressions;Comparison + * of different regexp-Implementations /a + * @see a href=http://regex.info/java.html;Overview about Java Regexp APIs + * /a + * + * @author Stephan Strittmatter - http://www.sybit.de + * @version 1.0 + * @since 0.7 + */ +public class OutlinkExtractor { + private static final Logger LOG = LogFormatter + .getLogger(OutlinkExtractor.class.getName()); + + /** + * Regex pattern to get URLs within a plain text. + * + * @see a + * href=http://www.truerwords.net/articles/ut/urlactivation.html;http://www.truerwords.net/articles/ut/urlactivation.html + * /a + */ + private static final String URL_PATTERN = + ([A-Za-z][A-Za-z0-9+.-]+:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@~=%-]*))?); + + /** + * Extracts codeOutlink/code from given plain text. + * + * @param plainText the plain text from wich URLs should be extracted. 
+ * + * @return Array of codeOutlink/codes within found in plainText + */ + public static Outlink[] getOutlinks(final String plainText) { +return OutlinkExtractor.getOutlinks(plainText, ); + } + + /** + * Extracts codeOutlink/code from given plain text and adds anchor + * to the extracted codeOutlink/codes + * + * @param plainText the plain text from wich URLs should be extracted. + * @param anchorthe anchor of the url + * + * @return Array of codeOutlink/codes within found in plainText + */ + public static Outlink[] getOutlinks(final String plainText, String anchor) { + +final List outlinks = new ArrayList(); + +try { + final PatternCompiler cp = new Perl5Compiler(); + final Pattern pattern = cp.compile(URL_PATTERN, + Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK + | Perl5Compiler.MULTILINE_MASK); + final PatternMatcher matcher = new Perl5Matcher(); + + final PatternMatcherInput input = new PatternMatcherInput(plainText); + + MatchResult result; + String url; + + //loop the matches + while (matcher.contains(input, pattern)) { +result = matcher.getMatch
svn commit: r233312 - in /lucene/nutch/trunk: docs/ca/ docs/de/ docs/en/ docs/es/ docs/fi/ docs/fr/ docs/hu/ docs/jp/ docs/ms/ docs/nl/ docs/pl/ docs/pt/ docs/sv/ docs/th/ docs/zh/ src/web/jsp/ src/web/style/
Author: jerome Date: Thu Aug 18 05:17:22 2005 New Revision: 233312 URL: http://svn.apache.org/viewcvs?rev=233312&view=rev Log: Gives focus to search query input Modified: lucene/nutch/trunk/docs/ca/about.html lucene/nutch/trunk/docs/ca/help.html lucene/nutch/trunk/docs/ca/search.html lucene/nutch/trunk/docs/de/about.html lucene/nutch/trunk/docs/de/help.html lucene/nutch/trunk/docs/de/search.html lucene/nutch/trunk/docs/en/about.html lucene/nutch/trunk/docs/en/help.html lucene/nutch/trunk/docs/en/search.html lucene/nutch/trunk/docs/es/about.html lucene/nutch/trunk/docs/es/help.html lucene/nutch/trunk/docs/es/search.html lucene/nutch/trunk/docs/fi/about.html lucene/nutch/trunk/docs/fi/help.html lucene/nutch/trunk/docs/fi/search.html lucene/nutch/trunk/docs/fr/about.html lucene/nutch/trunk/docs/fr/search.html lucene/nutch/trunk/docs/hu/about.html lucene/nutch/trunk/docs/hu/help.html lucene/nutch/trunk/docs/hu/search.html lucene/nutch/trunk/docs/jp/about.html lucene/nutch/trunk/docs/jp/help.html lucene/nutch/trunk/docs/jp/search.html lucene/nutch/trunk/docs/ms/about.html lucene/nutch/trunk/docs/ms/help.html lucene/nutch/trunk/docs/ms/search.html lucene/nutch/trunk/docs/nl/about.html lucene/nutch/trunk/docs/nl/help.html lucene/nutch/trunk/docs/nl/search.html lucene/nutch/trunk/docs/pl/about.html lucene/nutch/trunk/docs/pl/help.html lucene/nutch/trunk/docs/pl/search.html lucene/nutch/trunk/docs/pt/about.html lucene/nutch/trunk/docs/pt/help.html lucene/nutch/trunk/docs/pt/search.html lucene/nutch/trunk/docs/sv/about.html lucene/nutch/trunk/docs/sv/help.html lucene/nutch/trunk/docs/sv/search.html lucene/nutch/trunk/docs/th/about.html lucene/nutch/trunk/docs/th/help.html lucene/nutch/trunk/docs/th/search.html lucene/nutch/trunk/docs/zh/about.html lucene/nutch/trunk/docs/zh/help.html lucene/nutch/trunk/docs/zh/search.html lucene/nutch/trunk/src/web/jsp/search.jsp lucene/nutch/trunk/src/web/style/nutch-page.xsl Modified: lucene/nutch/trunk/docs/ca/about.html URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/about.html?rev=233312r1=233311r2=233312view=diff == --- lucene/nutch/trunk/docs/ca/about.html (original) +++ lucene/nutch/trunk/docs/ca/about.html Thu Aug 18 05:17:22 2005 @@ -19,8 +19,12 @@ /style link type=image/x-icon href=../img/favicon.ico rel=icon link type=image/x-icon href=../img/favicon.ico rel=shortcut icon +script type=text/javascript +!-- +function queryfocus() { document.search.query.focus(); } +// --/script /head -body +body onLoad=queryfocus(); !--This file is automatically generated. Do not edit!-- table cellspacing=0 cellpadding=0 border=0 width=635 tr Modified: lucene/nutch/trunk/docs/ca/help.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/help.html?rev=233312r1=233311r2=233312view=diff == --- lucene/nutch/trunk/docs/ca/help.html (original) +++ lucene/nutch/trunk/docs/ca/help.html Thu Aug 18 05:17:22 2005 @@ -19,8 +19,12 @@ /style link type=image/x-icon href=../img/favicon.ico rel=icon link type=image/x-icon href=../img/favicon.ico rel=shortcut icon +script type=text/javascript +!-- +function queryfocus() { document.search.query.focus(); } +// --/script /head -body +body onLoad=queryfocus(); !--This file is automatically generated. Do not edit!-- table cellspacing=0 cellpadding=0 border=0 width=635 tr Modified: lucene/nutch/trunk/docs/ca/search.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/search.html?rev=233312r1=233311r2=233312view=diff == --- lucene/nutch/trunk/docs/ca/search.html (original) +++ lucene/nutch/trunk/docs/ca/search.html Thu Aug 18 05:17:22 2005 @@ -19,8 +19,12 @@ /style link type=image/x-icon href=../img/favicon.ico rel=icon link type=image/x-icon href=../img/favicon.ico rel=shortcut icon +script type=text/javascript +!-- +function queryfocus() { document.search.query.focus(); } +// --/script /head -body +body onLoad=queryfocus(); !--This file is automatically generated. 
Do not edit!-- table cellspacing=0 cellpadding=0 border=0 width=635 tr Modified: lucene/nutch/trunk/docs/de/about.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/de/about.html?rev=233312r1=233311r2=233312view=diff == --- lucene/nutch/trunk/docs/de/about.html (original) +++ lucene/nutch/trunk/docs/de/about.html Thu Aug 18 05:17:22 2005 @@ -19,8 +19,12 @@ /style link type=image/x-icon href=../img/favicon.ico rel=icon link type=image/x-icon href=../img/favicon.ico rel=shortcut icon +script type=text/javascript +!-- +function
svn commit: r233140 - in /lucene/nutch/trunk: site/credits.html site/credits.pdf src/site/src/documentation/content/xdocs/credits.xml
Author: jerome Date: Wed Aug 17 02:39:29 2005 New Revision: 233140 URL: http://svn.apache.org/viewcvs?rev=233140view=rev Log: First commit test - Add myself to the list of committers Modified: lucene/nutch/trunk/site/credits.html lucene/nutch/trunk/site/credits.pdf lucene/nutch/trunk/src/site/src/documentation/content/xdocs/credits.xml Modified: lucene/nutch/trunk/site/credits.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/credits.html?rev=233140r1=233139r2=233140view=diff == --- lucene/nutch/trunk/site/credits.html (original) +++ lucene/nutch/trunk/site/credits.html Wed Aug 17 02:39:29 2005 @@ -201,6 +201,10 @@ liMike Cafarella/li li +a href=http://www.frutch.org/;Jeacute;rocirc;me Charron/a +/li + +li a href=http://www.nutch.org/blog/cutting.html;Doug Cutting/a /li @@ -214,7 +218,7 @@ /div -a name=N1002A/aa name=Friends/a +a name=N1002F/aa name=Friends/a h2 class=h3Friends/h2 div class=section ul @@ -245,7 +249,7 @@ /div -a name=N10074/aa name=Sponsors/a +a name=N10079/aa name=Sponsors/a h2 class=h3Sponsors/h2 div class=section ul Modified: lucene/nutch/trunk/site/credits.pdf URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/credits.pdf?rev=233140r1=233139r2=233140view=diff == --- lucene/nutch/trunk/site/credits.pdf (original) +++ lucene/nutch/trunk/site/credits.pdf Wed Aug 17 02:39:29 2005 @@ -58,10 +58,10 @@ endobj 14 0 obj - /Length 2181 /Filter [ /ASCII85Decode /FlateDecode ] + /Length 2288 /Filter [ /ASCII85Decode /FlateDecode ] stream -Gatm=968iGAII3Ya:;i#YfhoOM/V)S]5Y)Y3kI`l5jUFP$%=LP*?^Uf69OMYp)[EMAIL PROTECTED]:^b0dHd)GtMr)-^OfoC3iS2o/87PfD^,B+9JG'FY-@([EMAIL PROTECTED];oqY/ejPUi@@iL!tqemI#,'u1B=:]n$76+QB%KCD,TDs$ZJI,]C9!L(O]IBprnI_Kj(L[lSq,;OPn;\3M%tbZ`VMH;JG1E/iiU5n(FG]q:#/T:)*Zh0uYAL6)[EMAIL PROTECTED]*0K#D`cBALg]R9eV:[EMAIL PROTECTED](b]W;@EU`1)n,3k_FPKLr9\`u^A;6^@S?7]YoAfZBDRZs9D)#DPBsp'PN7-8Pg`Y;9,*!`$*qb^%!Hq(OBEPH$E,3ecs0]1_8%dh;u[EPUV8PA%ct[EMAIL PROTECTED],[EMAIL 
PROTECTED])(91nb1f'UPsh.:QgcmnI124ob902[%qe8M!b$#YO+14odl.qqQ?$D+8jAI-:\k-qMW*_)b;?([EMAIL PROTECTED]j5Y60#i:7HO=qseDCq/lejK^O?'6b[EMAIL PROTECTED],hKY-UP1-c\#u6Yq2o7L)i(j4MeiQg#(ORZUYuH!pC479up((XAY-EXo8!iPkqM0]H+TsM[HjHH0[a#r+8K7#?CBELoslU#jHR62t,0hui;gC*n[KEUl/fgkT+'mTOTPsscQlriB6t3@'D[r?+q,`c\Q;([EMAIL PROTECTED]'qDc$WTk--ji/Z8A2W\!uKD,RlnelZ`CXD/[EMAIL PROTECTED],th6Q*'p!jjQtVsQcl$ikl8:r.l0)YZ+d88_bH6u#81#?c,iENVKYlptAqnegF[kUl5K/[J+,[EMAIL PROTECTED],9;[EMAIL PROTECTED]@[TMK;H!Wsn!9aL#Hk!,L/?[EMAIL PROTECTED]@3;R*)G3V-*Z_,3-.;bSLt%EB:lG;Lc/:L;l'1OPhGP/%9.nMl,t=5;9NWH[hdYTpGJOq-MEgFk+0D^/in?c%%dUmtRa=k9aAh?5(W`o,W^ipu.*4O*HQ[b8%3*U3-2hlF1MnqS.B1NJR:C/^pQ`;([EMAIL PROTECTED]95;VWt[WZ$b#jF\160K4MtI\j?*4CITY]]g4r15)-,B#bmG3K!nTLf*/!=,;81bS'[EMAIL PROTECTED]:D=8\E?lce%+4!tcIJSCFDI8d]c7=k*\DQBQHh.jDK8#BWISlb^VJR$`V7/[EMAIL PROTECTED]C\PWgo+/[EMAIL PROTECTED]@q[EMAIL PROTECTED][1NeZ:.W5:6j3IIMiu:SMRB3,Ka(FR:m$?lWIbsJfeP5D:PMCi*/#'pg_9F#m*:p9Q[EMAIL PROTECTED]/671fcgaq,87rJMqb:@:MKLCUjU[b+A'qdIh*'--Hn4Rr]ibVQ):b;[EMAIL PROTECTED],R\EkK'#CAtfA6K)YaVKHo:#a4g;L]SuD5unu\9#_9t3)CXUeoY^Y#G1*`!FCAd]SegE8@RkGH;WnQj5pgnPP?5$S*YCZQ;__eY4n?1n!t1mqK6fUENR-V'o6i5I0e5SJ1]-7Q9grD.AIq*dM#VJnmDkS?C]Fs3\G60KbEFXk?#,CVZK;.V`q;d4U1dNg6j!rk,^m,Z=?pTDEXJ-/q?DNk!:7/[EMAIL PROTECTED]`-Gb+fF'bACIkT-!_g!'[EMAIL PROTECTED]q^VcnirDE,LJDUA=UGeg!0,4VEu;t8a/KU/q9+d)cmY:rb(Zpi3^L,uQgF5p%U7FNVi:tBeTE*e(rc)rHhQA\P'9,[EMAIL PROTECTED]M#as'kM$%2-W^a7JO?,qXk*]D+9bFA5CGdW[QT3EZggVRtUORC.PdRS[YTNJu/rc)Y9+-\sm*(=uM8W3`Q17*gM]ETCVCV_UKBn,/92_S=D=r/dA5F(lZK(HJWsnb!j8B2I*W!KCQURGhbfF?7m3:1pN12.IlPqjNc.3jJ_EJ/[EMAIL PROTECTED] +Gatm=Bci#:XAWTJ(Yt%ne1![EMAIL PROTECTED]7[C1qt/sg.%9W%d#;R+io5E1IcX_^[[EMAIL PROTECTED]'=a[K^7:?_;ArHUm\(3X\ebIg/Q6]=)aF*U1qTHIbrM)PVIB0A(q0h'#S2l61S8mR;rUj_Y,;`cVi1I[q00`'V[K[5=M?KPhQ+6X%jJf!!Jb7'@L]_d(3$36'[EMAIL 
PROTECTED])BL_ED[Ih9CD:?/4m?Y#+\SePa_G^a?ClN`3or4rt_,5WYPF.cL:dc+M[fCg-%fBX[UXn'TH8!lYSnDa`'o*q)4Up+=IQ1OJ`Uf/RkI]Qq?KU!O^:(@qFoLlRe2ojIeR\-al\P,F*;C]4HJh3[!4?1I^kZalD*5!;WQ[WGo+83!I,+PA8'5VD2n??bZF8lpf'[EMAIL PROTECTED];s+LNF]5N,bEpA%lWE%4'*;U9#aSi'm[-.5A-H6bHp#:4CN1M*1*Oe6([EMAIL PROTECTED]O'aUO99ZN_3.''dc4PBBuq`qn6t_:[EMAIL PROTECTED])I[*DY?WB8:8lF:3(Q'Eiu(0eeh@(3ncMFsO/?m#7-jd.A!.o/\F!+)7QU'eBEoFi,j15-SRk%Xn[;O6YU7cL%:X$o0FC[EMAIL PROTECTED])Hi$q79(9)[EMAIL PROTECTED]oDaOf/AeP;+Q:UM3n8WLlFiYi9Jk:[EMAIL PROTECTED]nnQt)*':[EMAIL PROTECTED])UA7`+q;!D$$(MGtk\^m]L(L`9iOofINi5[7i?'^H!G*TlTQ\8;LYn]RK`Y;#C9lhP[4PdoFkJHKGO)OMJ\d]`HcrHSXjlXoRTL)#?RJWP7NZYS67KVK-2E`NM.goZh\C0goCL!bgW\:(kN.pR%DA/s+IG:CG4]f0a;[EMAIL PROTECTED]:A\8cbcoQ99[KFo4!!j(LLt`GKgh0CgeI;Wp\!9XAB!XF*/Y=2?X/0+T)[EMAIL PROTECTED]:h2m(CJZj30/Q2]IVn5PT1/.,WIDUY3r`[(+3=D?He3eDc2J=AB0%[9=V%je_q]s;V-,Ei2ZTHI+(;G`V#^809@;6``JRCf^rg(7)O4upqBO\A-7ftiRkN`:PIB0B_n#6Z$U6t=\HKlhO/nERHl\D[kHnDV,5=`h:O\KR:+hn.pR;MU4Hp5dZH