svn commit: r412399 - /lucene/nutch/trunk/conf/mime-types.xml

2006-06-07 Thread jerome
Author: jerome
Date: Wed Jun  7 06:07:27 2006
New Revision: 412399

URL: http://svn.apache.org/viewvc?rev=412399&view=rev
Log:
NUTCH-275 : Remove the magic resolution for xml content-type

Modified:
lucene/nutch/trunk/conf/mime-types.xml

Modified: lucene/nutch/trunk/conf/mime-types.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/mime-types.xml?rev=412399&r1=412398&r2=412399&view=diff
==
--- lucene/nutch/trunk/conf/mime-types.xml (original)
+++ lucene/nutch/trunk/conf/mime-types.xml Wed Jun  7 06:07:27 2006
@@ -693,7 +693,7 @@
 mime-type name=text/xml
description=Extensible Markup Language File
 extxml/extextxsl/ext
-magic offset=0 value=lt;?xml/
+!--magic offset=0 value=lt;?xml/--
 /mime-type
 
 mime-type name=text/x-setext




svn commit: r412577 - in /lucene/nutch/trunk/src/test/org/apache/nutch/util/mime: mime-types.txt test.xml

2006-06-07 Thread jerome
Author: jerome
Date: Wed Jun  7 15:06:53 2006
New Revision: 412577

URL: http://svn.apache.org/viewvc?rev=412577&view=rev
Log:
NUTCH-275 : Remove unit test for magic based content type guessing for xml

Removed:
lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/test.xml
Modified:
lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt?rev=412577&r1=412576&r2=412577&view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt Wed 
Jun  7 15:06:53 2006
@@ -136,7 +136,7 @@
 text/tab-separated-values;tsv
 text/vnd.wap.wml;wml
 text/vnd.wap.wmlscript;wmls
-text/xml;xml;test.xml
+text/xml;xml
 text/xml;xsl
 text/x-setext;etx
 video/mpeg;mpg




svn commit: r412582 - /lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java

2006-06-07 Thread jerome
Author: jerome
Date: Wed Jun  7 15:19:08 2006
New Revision: 412582

URL: http://svn.apache.org/viewvc?rev=412582&view=rev
Log:
NUTCH-301 : CommonTerms are cached in the Configuration

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java?rev=412582&r1=412581&r2=412582&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java Wed 
Jun  7 15:19:08 2006
@@ -37,7 +37,10 @@
   private static final Logger LOG =
 LogFormatter.getLogger(org.apache.nutch.analysis.CommonGrams);
   private static final char SEPARATOR = '-';
-  private HashMap COMMON_TERMS = new HashMap();
+  /** The key used to cache commonTerms in Configuration */
+  private static final String KEY = CommonGrams.class.getName();
+
+  private HashMap commonTerms = new HashMap();
   
   /**
* The constructor.
@@ -135,7 +138,13 @@
 
   /** Construct using the provided config file. */
   private void init(Configuration conf) {
+// First, try to retrieve some commonTerms cached in configuration.
+commonTerms = (HashMap) conf.getObject(KEY);
+if (commonTerms != null) { return; }
+
+// Otherwise, read the terms.file
 try {
+  commonTerms = new HashMap();
   Reader reader = conf.getConfResourceAsReader
 (conf.get(analysis.common.terms.file));
   BufferedReader in = new BufferedReader(reader);
@@ -160,13 +169,14 @@
 while ((token = ts.next()) != null) {
   gram = gram + SEPARATOR + token.termText();
 }
-HashSet table = (HashSet)COMMON_TERMS.get(field);
+HashSet table = (HashSet)commonTerms.get(field);
 if (table == null) {
   table = new HashSet();
-  COMMON_TERMS.put(field, table);
+  commonTerms.put(field, table);
 }
 table.add(gram);
   }
+  conf.setObject(KEY, commonTerms);
 } catch (IOException e) {
   throw new RuntimeException(e.toString());
 }
@@ -175,7 +185,7 @@
   /** Construct a token filter that inserts n-grams for common terms.  For use
* while indexing documents.  */
   public TokenFilter getFilter(TokenStream ts, String field) {
-return new Filter(ts, (HashSet)COMMON_TERMS.get(field));
+return new Filter(ts, (HashSet)commonTerms.get(field));
   }
 
   /** Utility to convert an array of Query.Terms into a token stream. */




svn commit: r411926 - in /lucene/nutch/trunk/src/plugin/lib-http/src: java/org/apache/nutch/protocol/http/api/RobotRulesParser.java test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java

2006-06-05 Thread jerome
Author: jerome
Date: Mon Jun  5 14:43:42 2006
New Revision: 411926

URL: http://svn.apache.org/viewvc?rev=411926&view=rev
Log:
NUTCH-298 : No more NPE if a 404 for a robots.txt + some unit tests

Modified:

lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java

lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java

Modified: 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=411926&r1=411925&r2=411926&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
 Mon Jun  5 14:43:42 2006
@@ -70,8 +70,8 @@
* file, and can test paths against those rules.
*/
   public static class RobotRuleSet {
-ArrayList tmpEntries;
-RobotsEntry[] entries;
+ArrayList tmpEntries = new ArrayList();
+RobotsEntry[] entries = null;
 long expireTime;
 
 /**

Modified: 
lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=411926&r1=411925&r2=411926&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
 Mon Jun  5 14:43:42 2006
@@ -25,7 +25,29 @@
   private static final String CR= \r;
   private static final String CRLF= \r\n;
   
-
+  private static final boolean[] ACCEPT_ALL = {
+true,   // /a, 
+true,   // /a/,
+true,   // /a/bloh/foo.html
+true,   // /b, 
+true,   // /b/a,   
+true,   // /b/a/index.html,
+true,   // /b/b/foo.html,  
+true,   // /c, 
+true,   // /c/a,   
+true,   // /c/a/index.html,
+true,   // /c/b/foo.html,  
+true,   // /d, 
+true,   // /d/a,   
+true,   // /e/a/index.html,
+true,   // /e/d,   
+true,   // /e/d/foo.html,  
+true,   // /e/doh.html,
+true,   // /f/index.html,  
+true,   // /foo/bar.html,  
+true,   // /f/,
+  };
+  
   private static final String[] ROBOTS_STRINGS= new String[] {
 User-Agent: Agent1 #foo + CR 
 + Disallow: /a + CR 
@@ -40,6 +62,7 @@
 +  + CR 
 + User-Agent: * + CR 
 + Disallow: /foo/bar/ + CR,
+null  // Used to test EMPTY_RULES
   };
 
   private static final String[] AGENT_STRINGS= new String[] {
@@ -57,7 +80,14 @@
   false,
   false,
   true,
-}
+},
+{ 
+  false, 
+  false,
+  false,
+  false,
+  true,
+}
   };
 
   private static final String[] TEST_PATHS= new String[] {
@@ -195,6 +225,13 @@
false,  // /foo/bar.html,  
true,   // /f/,  
   }
+},
+{ // ROBOTS_STRINGS[1]
+  ACCEPT_ALL, // Agent 1
+  ACCEPT_ALL, // Agent 2
+  ACCEPT_ALL, // Agent 3
+  ACCEPT_ALL, // Agent 4
+  ACCEPT_ALL, // Agent 5
 }
   };
  
@@ -233,7 +270,9 @@
 for (int i= 1; i  agents.length; i++)
   agentsString= agentsString + , + agents[i];
 RobotRulesParser p= new RobotRulesParser(agents);
-RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString].getBytes());
+RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null
+ ? ROBOTS_STRINGS[robotsString].getBytes()
+ : null);
 for (int i= 0; i  paths.length; i++) {
   assertTrue(testing robots file +robotsString+, on agents (
 + agentsString + ), and path  + TEST_PATHS[i] + ; got  
@@ -243,4 +282,6 @@
 }
   }
 
+
+  
 }




svn commit: r411935 - /lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java

2006-06-05 Thread jerome
Author: jerome
Date: Mon Jun  5 15:02:35 2006
New Revision: 411935

URL: http://svn.apache.org/viewvc?rev=411935&view=rev
Log:
NutchAnalyzer is now Configurable (Teruhiko Kurosaka)

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java?rev=411935&r1=411934&r2=411935&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java 
Mon Jun  5 15:02:35 2006
@@ -22,6 +22,10 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+
 // Nutch imports
 import org.apache.nutch.plugin.Pluggable;
 
@@ -34,16 +38,37 @@
  * @author Jeacute;rocirc;me Charron
  */
 public abstract class NutchAnalyzer extends Analyzer
-implements Pluggable {
+implements Configurable, Pluggable {
 
   /** The name of the extension point. */
   final static String X_POINT_ID = NutchAnalyzer.class.getName();
 
+  /** The current Configuration */
+  protected Configuration conf = null;
+
   
   /**
* Creates a TokenStream which tokenizes all the text in the provided Reader.
*/
   public abstract TokenStream tokenStream(String fieldName, Reader reader);
 
-  
+
+  /* - *
+   * implementation:Configurable *
+   * - */
+
+  // Inherited Javadoc
+  public void setConf(Configuration conf) {
+this.conf = conf;
+  }
+
+  // Inherited Javadoc
+  public Configuration getConf() {
+return this.conf;
+  }
+
+  /* -- *
+   * /implementation:Configurable *
+   * -- */
+
 }




svn commit: r406053 - in /lucene/nutch/trunk/src: java/org/apache/nutch/clustering/ java/org/apache/nutch/parse/ java/org/apache/nutch/scoring/ plugin/protocol-httpclient/src/java/org/apache/nutch/pro

2006-05-13 Thread jerome
Author: jerome
Date: Sat May 13 02:14:53 2006
New Revision: 406053

URL: http://svn.apache.org/viewcvs?rev=406053&view=rev
Log:
Fix Javadoc Warnings

Modified:

lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java

lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java?rev=406053&r1=406052&r2=406053&view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
 (original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
 Sat May 13 02:14:53 2006
@@ -60,7 +60,7 @@
 
   /**
   * @return Returns the online clustering extension specified
-  * in nutch configuration (key name in this field: [EMAIL PROTECTED] 
#CONFIG_FIELD_NAME}). 
+  * in nutch configuration (key name is 
codeextension.clustering.extension-name/code). 
   * If the name is empty (no preference), the first available clustering 
extension is
   * returned.
   */

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=406053&r1=406052&r2=406053&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java 
Sat May 13 02:14:53 2006
@@ -46,7 +46,7 @@
  * @author mattmann
  * @version 1.0
  */
-public class ParsePluginsReader {
+class ParsePluginsReader {
   
   /* our log stream */
   public static final Logger LOG = 
@@ -66,9 +66,9 @@
   
   /**
* Reads the codeparse-plugins.xml/code file and returns the
-   * [EMAIL PROTECTED] ParsePluginList} defined by it.
+   * [EMAIL PROTECTED] #ParsePluginList} defined by it.
*
-   * @return A [EMAIL PROTECTED] ParsePluginList} specified by the
+   * @return A [EMAIL PROTECTED] #ParsePluginList} specified by the
* codeparse-plugins.xml/code file.
* @throws Exception
* If any parsing error occurs.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=406053&r1=406052&r2=406053&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Sat 
May 13 02:14:53 2006
@@ -74,7 +74,7 @@
   /**
* Function returns an array of [EMAIL PROTECTED] Parser}s for a given 
content type.
*
-   * The function consults the internal [EMAIL PROTECTED] ParsePluginList} for 
the
+   * The function consults the internal list of parse plugins for the
* ParserFactory to determine the list of pluginIds, then gets the
* appropriate extension points to instantiate as [EMAIL PROTECTED] Parser}s.
*
@@ -149,12 +149,12 @@
* instantiate the Parser itself , then this function will cache that Parser
* in the internal codePARSER_CACHE/code.
* 
-   * @param extId The string extension ID (e.g.,
+   * @param id The string extension ID (e.g.,
*org.apache.nutch.parse.rss.RSSParser,
*org.apache.nutch.parse.rtf.RTFParseFactory) of the [EMAIL 
PROTECTED] Parser}
*implementation to return.
* @return A [EMAIL PROTECTED] Parser} implementation specified by the 
parameter
-   * codeextId/code.
+   * codeid/code.
* @throws ParserNotFound If the Parser is not found (i.e., registered with
* the extension point), or if the there a
* [EMAIL PROTECTED] PluginRuntimeException} instantiating the 
[EMAIL PROTECTED] Parser}.

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java?rev=406053&r1=406052&r2=406053&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch

svn commit: r405566 - /lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java

2006-05-09 Thread jerome
Author: jerome
Date: Tue May  9 16:06:17 2006
New Revision: 405566

URL: http://svn.apache.org/viewcvs?rev=405566&view=rev
Log:
NUTCH-134 - No more need for the clusterer to remove html tags from summaries

Modified:

lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java

Modified: 
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java?rev=405566&r1=405565&r2=405566&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
 Tue May  9 16:06:17 2006
@@ -34,7 +34,6 @@
 import com.dawidweiss.carrot.core.local.ProcessingException;
 import com.dawidweiss.carrot.core.local.RequestContext;
 import com.dawidweiss.carrot.core.local.clustering.*;
-import com.dawidweiss.carrot.util.common.StringUtils;
 
 /**
  * A local input component that ignores the query passed from the
@@ -103,7 +102,7 @@
 // produce 'documents' for successor components.
 final RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
 for (int i=0;isummaries.length;i++) {
-  consumer.addDocument(new NutchDocument(i, details[i], 
htmlToText(summaries[i]), defaultLanguage));
+  consumer.addDocument(new NutchDocument(i, details[i], summaries[i], 
defaultLanguage));
 }
   }
 
@@ -121,14 +120,4 @@
 return SUCCESSOR_CAPABILITIES;
   }
 
-  /**
-   * Converts a html chunk to plain text.
-   * 
-   * This method is only required because Nutch's summaries are in HTML.
-   * I guess it would be possible to get rid of the code below by
-   * adding patches/ methods to Nutch that return plain text summaries. 
-   */
-  private final String htmlToText(String html) {
-return StringUtils.removeMarkup(html);
-  }
 }




svn commit: r405165 - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/searcher/ src/plugin/ src/plugin/nutch-extensionpoints/ src/plugin/summary-basic/ src/plugin/summary-basic/src/ src/plu

2006-05-08 Thread jerome
Author: jerome
Date: Mon May  8 14:04:01 2006
New Revision: 405165

URL: http://svn.apache.org/viewcvs?rev=405165&view=rev
Log:
NUTCH-134 : Added a summarizer extension point and two extensions:
* summary-basic is the current nutch implementation moved into a plugin
* summary-lucene a raw version of a summarizer plugin based on lucene 
highlighter

Added:

lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java   
(with props)
lucene/nutch/trunk/src/plugin/summary-basic/
lucene/nutch/trunk/src/plugin/summary-basic/build.xml   (with props)
lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml   (with props)
lucene/nutch/trunk/src/plugin/summary-basic/src/
lucene/nutch/trunk/src/plugin/summary-basic/src/java/
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/

lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/

lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/

lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
   (with props)

lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html
   (with props)
lucene/nutch/trunk/src/plugin/summary-lucene/
lucene/nutch/trunk/src/plugin/summary-lucene/build.xml   (with props)
lucene/nutch/trunk/src/plugin/summary-lucene/lib/

lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar
   (with props)
lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml   (with props)
lucene/nutch/trunk/src/plugin/summary-lucene/src/
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/

lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/

lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/

lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java
   (with props)

lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html
   (with props)
Modified:
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/default.properties
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=405165&r1=405164&r2=405165&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Mon May  8 14:04:01 2006
@@ -323,6 +323,8 @@
   packageset dir=${plugins.dir}/query-more/src/java/
   packageset dir=${plugins.dir}/query-site/src/java/
   packageset dir=${plugins.dir}/query-url/src/java/
+  packageset dir=${plugins.dir}/summary-basic/src/java/
+  packageset dir=${plugins.dir}/summary-lucene/src/java/
   packageset dir=${plugins.dir}/urlfilter-automaton/src/java/
   packageset dir=${plugins.dir}/urlfilter-regex/src/java/
   packageset dir=${plugins.dir}/urlfilter-prefix/src/java/
@@ -350,6 +352,7 @@
   group title=Analysis Plugins packages=${plugins.analysis}/
   group title=Indexing Filter Plugins packages=${plugins.index}/
   group title=Query Filter Plugins packages=${plugins.query}/
+  group title=Summary Plugins packages=${plugins.summary}/
   group title=Clustering Plugins packages=${plugins.clustering}/
   group title=Ontology Plugins packages=${plugins.ontology}/
   group title=Misc. Plugins packages=${plugins.misc}/

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=405165&r1=405164&r2=405165&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon May  8 14:04:01 2006
@@ -564,7 +564,7 @@
 
 property
   nameplugin.includes/name
-  
valueprotocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value
+  
valueprotocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic/value
   descriptionRegular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any

svn commit: r394228 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/plugin/ src/plugin/ src/plugin/analysis-de/ src/plugin/analysis-fr/ src/plugin/clustering-carrot2/ src/plugin/creativecommons

2006-04-14 Thread jerome
Author: jerome
Date: Fri Apr 14 16:57:24 2006
New Revision: 394228

URL: http://svn.apache.org/viewcvs?rev=394228&view=rev
Log:
NUTCH-245 : Added a DTD for Nutch Plugin Manifest
  - Add a commented DTD in src
  - Add the DTD in javadoc
  - Change the implementation element structure : uses name-value parameters 
instead of proprietary attributes
  - Fix unit tests regarding changes in DTD
  - Fix the plugin.xml file in nutch plugins regarding changes in DTD

Added:
lucene/nutch/trunk/src/plugin/plugin.dtd   (with props)
Modified:
lucene/nutch/trunk/build.xml

lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/package.html
lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml
lucene/nutch/trunk/src/plugin/analysis-fr/plugin.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml
lucene/nutch/trunk/src/plugin/index-basic/plugin.xml
lucene/nutch/trunk/src/plugin/index-more/plugin.xml
lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml
lucene/nutch/trunk/src/plugin/lib-commons-httpclient/plugin.xml
lucene/nutch/trunk/src/plugin/lib-http/plugin.xml
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml
lucene/nutch/trunk/src/plugin/lib-log4j/plugin.xml
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml
lucene/nutch/trunk/src/plugin/lib-nekohtml/plugin.xml
lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml
lucene/nutch/trunk/src/plugin/lib-regex-filter/plugin.xml
lucene/nutch/trunk/src/plugin/lib-xml/plugin.xml
lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
lucene/nutch/trunk/src/plugin/ontology/plugin.xml
lucene/nutch/trunk/src/plugin/parse-ext/plugin.xml
lucene/nutch/trunk/src/plugin/parse-html/plugin.xml
lucene/nutch/trunk/src/plugin/parse-js/plugin.xml
lucene/nutch/trunk/src/plugin/parse-mp3/plugin.xml
lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml
lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml
lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
lucene/nutch/trunk/src/plugin/parse-rtf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-swf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-text/plugin.xml
lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-file/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-ftp/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-http/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
lucene/nutch/trunk/src/plugin/query-basic/plugin.xml
lucene/nutch/trunk/src/plugin/query-more/plugin.xml
lucene/nutch/trunk/src/plugin/query-site/plugin.xml
lucene/nutch/trunk/src/plugin/query-url/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml
lucene/nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=394228&r1=394227&r2=394228&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Fri Apr 14 16:57:24 2006
@@ -279,6 +279,9 @@
   !-- Documentation  --
   !-- == --
   target name=javadoc depends=compile
+!-- Copy the plugin.dtd file to the plugin doc-files dir --
+copy file=${plugins.dir}/plugin.dtd
+  todir=${src.dir}/org/apache/nutch/plugin/doc-files/
 mkdir dir=${build.javadoc}/
 javadoc
   overview=${src.dir}/overview.html
@@ -353,6 +356,7 @@
   group title=Ontology Plugins packages=${plugins.ontology}/
   group title=Misc. Plugins packages=${plugins.misc}/
 /javadoc
+
   /target

   target name=default-doc

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=394228&r1=394227&r2=394228&view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
Fri Apr 14 16:57:24 2006
@@ -36,6 +36,8 @@
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
+import org.xml.sax.EntityResolver

svn commit: r394231 - in /lucene/nutch/trunk: build.xml src/plugin/plugin.dtd

2006-04-14 Thread jerome
Author: jerome
Date: Fri Apr 14 17:13:21 2006
New Revision: 394231

URL: http://svn.apache.org/viewcvs?rev=394231&view=rev
Log:
NUTCH-245 : Some minor fixes
  - Added Apache License in DTD (?)
  - Delete the org/apache/nutch/plugin/doc-files once javadoc task completed.

Modified:
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/src/plugin/plugin.dtd

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=394231&r1=394230&r2=394231&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Fri Apr 14 17:13:21 2006
@@ -356,7 +356,8 @@
   group title=Ontology Plugins packages=${plugins.ontology}/
   group title=Misc. Plugins packages=${plugins.misc}/
 /javadoc
-
+!-- Clean the doc-files dir from src --
+delete dir=${src.dir}/org/apache/nutch/plugin/doc-files/
   /target

   target name=default-doc

Modified: lucene/nutch/trunk/src/plugin/plugin.dtd
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/plugin.dtd?rev=394231&r1=394230&r2=394231&view=diff
==
--- lucene/nutch/trunk/src/plugin/plugin.dtd (original)
+++ lucene/nutch/trunk/src/plugin/plugin.dtd Fri Apr 14 17:13:21 2006
@@ -1,10 +1,25 @@
 ?xml version=1.0 encoding=UTF-8?
 
 !--
+ ! Copyright 2005 The Apache Software Foundation
+ !
+ ! Licensed under the Apache License, Version 2.0 (the License);
+ ! you may not use this file except in compliance with the License.
+ ! You may obtain a copy of the License at
+ !
+ ! http://www.apache.org/licenses/LICENSE-2.0
+ !
+ ! Unless required by applicable law or agreed to in writing, software
+ ! distributed under the License is distributed on an AS IS BASIS,
+ ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ! See the License for the specific language governing permissions and
+ ! limitations under the License.
+ !
+ !
  !  Document   : plugin.dtd
  !  Created on : 14 avril 2006, 22:14
- !  Author : Jerome Charron
- !  Description: Nutch plug-in manifest
+ !  Author : Chris Mattmann, Jerome Charron
+ !  Description: Nutch plug-in manifest DTD
  !
  !  PUBLIC ID  : -//Apache Software Fundation//DTD Nutch Plugin Manifest 
1.0//EN
  !  SYSTEM ID  : http://lucene.apache.org/nutch/plugin.dtd




svn commit: r391958 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/parse/ParseData.java src/test/org/apache/nutch/parse/TestParseData.java src/test/org/apache/nutch/util/Wr

2006-04-06 Thread jerome
Author: jerome
Date: Thu Apr  6 03:49:40 2006
New Revision: 391958

URL: http://svn.apache.org/viewcvs?rev=391958&view=rev
Log:
NUTCH-244, db.max.outlinks.per.page can now be negative for no limit of handled 
outlinks per page

Modified:
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=391958&r1=391957&r2=391958&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Apr  6 03:49:40 2006
@@ -255,6 +255,8 @@
   namedb.max.outlinks.per.page/name
   value100/value
   descriptionThe maximum number of outlinks that we'll process for a page.
+  If this value is nonnegative (=0), at most db.max.outlinks.per.page outlinks
+  will be processed for a page; otherwise, all outlinks will be processed.
   /description
 /property
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=391958&r1=391957&r2=391958&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Thu Apr  
6 03:49:40 2006
@@ -119,12 +119,15 @@
 
 int totalOutlinks = in.readInt(); // read outlinks
 int maxOutlinksPerPage = this.conf.getInt(db.max.outlinks.per.page, 100);
-int outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks);
+int outlinksToRead = totalOutlinks;
+if (maxOutlinksPerPage = 0) {
+  outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks);
+}
 outlinks = new Outlink[outlinksToRead];
 for (int i = 0; i  outlinksToRead; i++) {
   outlinks[i] = Outlink.read(in);
 }
-for (int i = maxOutlinksPerPage; i  totalOutlinks; i++) {
+for (int i = outlinksToRead; i  totalOutlinks; i++) {
   Outlink.skip(in);
 }
 

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?rev=391958&r1=391957&r2=391958&view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Thu 
Apr  6 03:49:40 2006
@@ -51,4 +51,31 @@
 WritableTestUtils.testWritable(r, conf);
   }

+  public void testMaxOutlinks() throws Exception {
+Outlink[] outlinks = new Outlink[128];
+for (int i=0; ioutlinks.length; i++) {
+  outlinks[i] = new Outlink(http://outlink.com/; + i, Outlink + i, 
conf);
+}
+ParseData original = new ParseData(ParseStatus.STATUS_SUCCESS,
+   Max Outlinks Title,
+   outlinks,
+   new Metadata());
+Configuration conf = NutchConfiguration.create();
+// No Outlinks
+conf.setInt(db.max.outlinks.per.page, 0);
+ParseData data = (ParseData) WritableTestUtils.writeRead(original, conf);
+assertEquals(0, data.getOutlinks().length);
+// Only 100 Outlinks
+conf.setInt(db.max.outlinks.per.page, 100);
+data = (ParseData) WritableTestUtils.writeRead(original, conf);
+assertEquals(100, data.getOutlinks().length);
+// 256 Outlinks
+conf.setInt(db.max.outlinks.per.page, 256);
+data = (ParseData) WritableTestUtils.writeRead(original, conf);
+assertEquals(outlinks.length, data.getOutlinks().length);
+// All Outlinks
+conf.setInt(db.max.outlinks.per.page, -1);
+data = (ParseData) WritableTestUtils.writeRead(original, conf);
+assertEquals(outlinks.length, data.getOutlinks().length);
+  }
 }

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java?rev=391958r1=391957r2=391958view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java 
Thu Apr  6 03:49:40 2006
@@ -31,6 +31,14 @@
   /** Utility method for testing writables. */
   public static void testWritable(Writable before, Configuration conf)
   throws Exception {
+TestCase.assertEquals(before, writeRead(before, conf

svn commit: r391150 - /lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java

2006-04-03 Thread jerome
Author: jerome
Date: Mon Apr  3 13:57:46 2006
New Revision: 391150

URL: http://svn.apache.org/viewcvs?rev=391150view=rev
Log:
No longer dump the parse-mspowerpoint unit test results to a file for visual checks

Modified:

lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java

Modified: 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=391150r1=391149r2=391150view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
 Mon Apr  3 13:57:46 2006
@@ -70,7 +70,7 @@
   /**
* Whether to dump the extracted data to a file for visual checks.
*/
-  private final static boolean DUMP_TO_FILE = true;
+  private final static boolean DUMP_TO_FILE = false;
 
   private final File testFile;
 




svn commit: r390392 - in /lucene/nutch/trunk/src/java/org/apache/nutch: analysis/ clustering/ indexer/ net/ ontology/ parse/ plugin/ protocol/ searcher/

2006-03-31 Thread jerome
Author: jerome
Date: Fri Mar 31 03:04:43 2006
New Revision: 390392

URL: http://svn.apache.org/viewcvs?rev=390392view=rev
Log:
Add a common Pluggable interface to all Extension Points and a package 
description for plugin

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/Pluggable.java   (with 
props)
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/package.html   (with 
props)
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/ontology/Ontology.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java?rev=390392r1=390391r2=390392view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java 
Fri Mar 31 03:04:43 2006
@@ -22,6 +22,9 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
+
 
 /** 
  * Extension point for analysis.
@@ -30,7 +33,8 @@
  *
  * @author J&eacute;r&ocirc;me Charron
  */
-public abstract class NutchAnalyzer extends Analyzer {
+public abstract class NutchAnalyzer extends Analyzer
+implements Pluggable {
 
   /** The name of the extension point. */
   final static String X_POINT_ID = NutchAnalyzer.class.getName();

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java?rev=390392r1=390391r2=390392view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java 
Fri Mar 31 03:04:43 2006
@@ -16,8 +16,11 @@
 
 package org.apache.nutch.clustering;
 
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
 import org.apache.nutch.searcher.HitDetails;
 
+
 /**
  * An extension point interface for online search results clustering
  * algorithms.
@@ -33,7 +36,7 @@
  * @author Dawid Weiss
  * @version $Id: OnlineClusterer.java,v 1.1 2004/08/09 23:23:52 johnnx Exp $
  */
-public interface OnlineClusterer {
+public interface OnlineClusterer extends Pluggable {
   /** The name of the extension point. */
   public final static String X_POINT_ID = OnlineClusterer.class.getName();
 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=390392r1=390391r2=390392view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java 
Fri Mar 31 03:04:43 2006
@@ -16,18 +16,25 @@
 
 package org.apache.nutch.indexer;
 
+// Lucene imports
 import org.apache.lucene.document.Document;
-import org.apache.nutch.parse.Parse;
+
+// Hadoop imports
 import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.io.UTF8;
+
+// Nutch imports
+import org.apache.nutch.parse.Parse;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
-import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.plugin.Pluggable;
+
 
 /** Extension point for indexing.  Permits one to add metadata to the indexed
  * fields.  All plugins found which implement this extension point are run
  * sequentially on the parse.
  */
-public interface IndexingFilter extends Configurable {
+public interface IndexingFilter extends Pluggable, Configurable {
   /** The name of the extension point. */
   final static String X_POINT_ID = IndexingFilter.class.getName();
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java?rev=390392r1=390391r2=390392view=diff
==
--- lucene/nutch/trunk/src

svn commit: r389729 - in /lucene/nutch/trunk/src/plugin/parse-rss: build.xml lib/jaxen-core.jar lib/jaxen-jdom.jar lib/jdom.jar lib/saxpath.jar lib/xercesImpl.jar lib/xml-apis.jar plugin.xml

2006-03-29 Thread jerome
Author: jerome
Date: Wed Mar 29 01:45:01 2006
New Revision: 389729

URL: http://svn.apache.org/viewcvs?rev=389729view=rev
Log:
parse-rss now depends on new lib-xml plugin

Removed:
lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-core.jar
lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-jdom.jar
lucene/nutch/trunk/src/plugin/parse-rss/lib/jdom.jar
lucene/nutch/trunk/src/plugin/parse-rss/lib/saxpath.jar
lucene/nutch/trunk/src/plugin/parse-rss/lib/xercesImpl.jar
lucene/nutch/trunk/src/plugin/parse-rss/lib/xml-apis.jar
Modified:
lucene/nutch/trunk/src/plugin/parse-rss/build.xml
lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml

Modified: lucene/nutch/trunk/src/plugin/parse-rss/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/build.xml?rev=389729r1=389728r2=389729view=diff
==
--- lucene/nutch/trunk/src/plugin/parse-rss/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/build.xml Wed Mar 29 01:45:01 2006
@@ -6,18 +6,21 @@
 
  !-- Build compilation dependencies --
  target name=deps-jar
+   ant target=jar inheritall=false dir=../lib-xml/
ant target=jar inheritall=false dir=../lib-log4j/
  /target
 
  !-- Add compilation dependencies to classpath --
  path id=plugin.deps
fileset dir=${nutch.root}/build
+ include name=**/lib-xml/*.jar /
  include name=**/lib-log4j/*.jar /
/fileset
  /path
 
  !-- Deploy Unit test dependencies --
  target name=deps-test
+   ant target=deploy inheritall=false dir=../lib-xml/
ant target=deploy inheritall=false dir=../lib-log4j/
ant target=deploy inheritall=false dir=../nutch-extensionpoints/
ant target=deploy inheritall=false dir=../lib-commons-httpclient/

Modified: lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml?rev=389729r1=389728r2=389729view=diff
==
--- lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Wed Mar 29 01:45:01 2006
@@ -10,19 +10,13 @@
   library name=parse-rss.jar
  export name=*/
   /library
-  library name=jdom.jar/
-  library name=jaxen-core.jar/
-  library name=jaxen-jdom.jar/
   library name=commons-feedparser-0.6-fork.jar/
-  library name=saxpath.jar/
-  library name=log4j-1.2.6.jar/
-  library name=xercesImpl.jar/
-  library name=xml-apis.jar/
   library name=xml-rpc-1.2.jar/
/runtime
 
requires
   import plugin=nutch-extensionpoints/
+  import plugin=lib-xml/
   import plugin=lib-log4j/
   import plugin=lib-commons-httpclient/
/requires




svn commit: r389456 - /lucene/nutch/trunk/build.xml

2006-03-28 Thread jerome
Author: jerome
Date: Tue Mar 28 01:40:29 2006
New Revision: 389456

URL: http://svn.apache.org/viewcvs?rev=389456view=rev
Log:
NUTCH-210, forgot to commit the nutch.xml xsl generation

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=389456r1=389455r2=389456view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Tue Mar 28 01:40:29 2006
@@ -144,6 +144,14 @@
   !----
   !-- == --
   target name=war depends=jar,compile,generate-docs
+
+!-- generate the nutch.xml (servlet context) file --
+xslt in=${basedir}/conf/nutch-default.xml
+  out=${build.dir}/nutch.xml
+  style=${basedir}/conf/context.xsl
+xmlcatalog refid=docDTDs/
+   outputproperty name=indent value=yes/
+/xslt
 war destfile=${build.dir}/${final.name}.war
 webxml=${web.src.dir}/web.xml
   fileset dir=${web.src.dir}/jsp/




svn commit: r389514 - /lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java

2006-03-28 Thread jerome
Author: jerome
Date: Tue Mar 28 06:56:21 2006
New Revision: 389514

URL: http://svn.apache.org/viewcvs?rev=389514view=rev
Log:
NUTCH-210, make use of nutch.xml servlet context in opensearch servlet too

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=389514r1=389513r2=389514view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java 
Tue Mar 28 06:56:21 2006
@@ -62,7 +62,7 @@
 
   public void init(ServletConfig config) throws ServletException {
 try {
-  this.conf = NutchConfiguration.create();
+  this.conf = NutchConfiguration.get(config.getServletContext());
   bean = NutchBean.get(config.getServletContext(), this.conf);
 } catch (IOException e) {
   throw new ServletException(e);




svn commit: r389160 - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/util/ src/web/jsp/

2006-03-27 Thread jerome
Author: jerome
Date: Mon Mar 27 06:52:14 2006
New Revision: 389160

URL: http://svn.apache.org/viewcvs?rev=389160view=rev
Log:
NUTCH-210, Add an xsl that generates a basic ServletContext XML file for the 
nutch webapp and make use of the
ServletContext init parameters to override the properties in nutch-default and 
nutch-site properties.
Contributed by Chris Mattmann.

Added:
lucene/nutch/trunk/conf/context.xsl   (with props)
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
lucene/nutch/trunk/src/web/jsp/anchors.jsp
lucene/nutch/trunk/src/web/jsp/cached.jsp
lucene/nutch/trunk/src/web/jsp/explain.jsp
lucene/nutch/trunk/src/web/jsp/refine-query-init.jsp
lucene/nutch/trunk/src/web/jsp/search.jsp
lucene/nutch/trunk/src/web/jsp/text.jsp

Added: lucene/nutch/trunk/conf/context.xsl
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/context.xsl?rev=389160view=auto
==
--- lucene/nutch/trunk/conf/context.xsl (added)
+++ lucene/nutch/trunk/conf/context.xsl Mon Mar 27 06:52:14 2006
@@ -0,0 +1,87 @@
+?xml version=1.0 encoding=ISO-8859-1?
+!--
+   Copyright 2006 The Apache Software Foundation
+   
+   Licensed under the Apache License, Version 2.0 (the License);
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   
+   http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an AS IS BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+   
+   Author : Chris Mattmann
+Author : Jérôme Charron
+   Description: This xsl file is used to transform dynamic properties out 
of the
+   nutch-default.xml file into a deployable Context.xml file for 
configuring
+   the Nutch war file in a servlet container.
+--
+
+xsl:stylesheet version=1.0 xmlns:xsl=http://www.w3.org/1999/XSL/Transform;
+
+xsl:template match=/
+Context path=/nutch docBase=nutch-0.8-dev.war
+debug=5 reloadable=true crossContext=true
+
+xsl:for-each select=configuration/property
+
+!-- searcher. properties --
+xsl:call-template name=parameter
+  xsl:with-param name=property select=current()/
+  xsl:with-param name=filter select='searcher.'/
+/xsl:call-template
+
+!-- plugin. properties --
+xsl:call-template name=parameter
+  xsl:with-param name=property select=current()/
+  xsl:with-param name=filter select='plugin.'/
+/xsl:call-template
+
+!-- extension.clustering. properties --
+xsl:call-template name=parameter
+  xsl:with-param name=property select=current()/
+  xsl:with-param name=filter select='extension.clustering.'/
+/xsl:call-template
+
+!-- extension.ontology. properties --
+xsl:call-template name=parameter
+  xsl:with-param name=property select=current()/
+  xsl:with-param name=filter select='extension.ontology.'/
+/xsl:call-template
+
+!-- query. properties --
+xsl:call-template name=parameter
+  xsl:with-param name=property select=current()/
+  xsl:with-param name=filter select='query.'/
+/xsl:call-template
+
+/xsl:for-each
+
+/Context
+/xsl:template
+
+
+!--
+ ! Template used to write out a parameter if the property's
+ ! name contains the specified filter string.
+ !--
+xsl:template name=parameter
+  xsl:param name=property/
+  xsl:param name=filter/
+  xsl:if test=contains(name, $filter)
+  Parameter override=false
+xsl:attribute name=name
+  xsl:value-of select=name/
+/xsl:attribute
+xsl:attribute name=value
+  xsl:value-of select=value/
+/xsl:attribute
+  /Parameter
+  /xsl:if
+/xsl:template
+
+/xsl:stylesheet 

Propchange: lucene/nutch/trunk/conf/context.xsl
--
svn:eol-style = native

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=389160r1=389159r2=389160view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java 
Mon Mar 27 06:52:14 2006
@@ -16,12 +16,23 @@
 
 package org.apache.nutch.util;
 
+// JDK imports
+import java.util.Enumeration;
+
+// Servlet imports
+import javax.servlet.ServletContext;
+
+// Hadoop imports
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.WritableName;
 
+
 /** Utility to create Hadoop {@link Configuration}s that include 
Nutch-specific
  * resources.  */
 public class

svn commit: r388293 - /lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

2006-03-23 Thread jerome
Author: jerome
Date: Thu Mar 23 15:21:03 2006
New Revision: 388293

URL: http://svn.apache.org/viewcvs?rev=388293view=rev
Log:
Set the configuration of the parser used in the main method to fix NPEs

Modified:

lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

Modified: 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=388293r1=388292r2=388293view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
 Thu Mar 23 15:21:03 2006
@@ -269,9 +269,11 @@
 byte[] bytes = new byte[(int)file.length()];
 DataInputStream in = new DataInputStream(new FileInputStream(file));
 in.readFully(bytes);
-Parse parse = new HtmlParser().getParse(
-new Content(url, url, bytes, text/html, new Metadata(),
-NutchConfiguration.create()));
+Configuration conf = NutchConfiguration.create();
+HtmlParser parser = new HtmlParser();
+parser.setConf(conf);
+Parse parse = parser.getParse(
+new Content(url, url, bytes, text/html, new Metadata(), conf));
 System.out.println(data: +parse.getData());
 
 System.out.println(text: +parse.getText());




svn commit: r387650 - in /lucene/nutch/trunk/src/plugin/urlfilter-regex: ./ sample/ src/java/org/apache/nutch/net/ src/test/ src/test/org/ src/test/org/apache/ src/test/org/apache/nutch/ src/test/org/

2006-03-21 Thread jerome
Author: jerome
Date: Tue Mar 21 14:26:56 2006
New Revision: 387650

URL: http://svn.apache.org/viewcvs?rev=387650view=rev
Log:
urlfilter-regex now uses the lib-regex-filter framework.
Add some unit tests.

Added:
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/

lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java
   (with props)
Modified:
lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml

lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java

Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml?rev=387650r1=387649r2=387650view=diff
==
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml Tue Mar 21 14:26:56 
2006
@@ -4,4 +4,29 @@
 
   import file=../build-plugin.xml/
 
+  !-- Build compilation dependencies --
+  target name=deps-jar
+ant target=jar inheritall=false dir=../lib-regex-filter/
+ant target=compile-test inheritall=false dir=../lib-regex-filter/
+  /target
+
+  !-- Add compilation dependencies to classpath --
+  path id=plugin.deps
+fileset dir=${nutch.root}/build
+  include name=**/lib-regex-filter/*.jar /
+/fileset
+pathelement location=${nutch.root}/build/lib-regex-filter/test/
+  /path
+
+  !-- Deploy Unit test dependencies --
+  target name=deps-test
+ant target=deploy inheritall=false dir=../lib-regex-filter/
+  /target
+
+  !-- for junit test --
+  mkdir dir=${build.test}/data/
+  copy todir=${build.test}/data
+fileset dir=sample includes=**/*.rules, **/*.urls/
+  /copy
+
 /project

Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules?rev=387650view=auto
==
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules 
(added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules Tue 
Mar 21 14:26:56 2006
@@ -0,0 +1,26 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:,  mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip .fr .org and .net domains
+-^.*//.*\.fr/
+-^.*//.*\.org/
+-^.*//.*\.net/
+
+# accept everything else
++.

Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls?rev=387650view=auto
==
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls Tue 
Mar 21 14:26:56 2006
@@ -0,0 +1,297 @@
++http://www.hostip.info/
+-http://www.elanceur.org/Articles/OntologieSurfaite.html
++http://www.opensymphony.com/quartz/
+-http://www.portletbridge.org/saxbenchmark/index.html
++http://www.lesmotsdelinfo.com/
++http://usefulinc.com/doap/
++http://www.codezoo.com/
++http://search.infocious.com/
+-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
++http://www.brics.dk/%7Eamoeller/automaton/
++http://jazzz.com/wp.html
++http://www.maxkiesler.com/index.php
++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
++http://www.alias-i.com/lingpipe/
+-http

svn commit: r387651 - in /lucene/nutch/trunk/src/plugin/urlfilter-automaton: ./ lib/ sample/ src/ src/java/ src/java/org/ src/java/org/apache/ src/java/org/apache/nutch/ src/java/org/apache/nutch/net/

2006-03-21 Thread jerome
Author: jerome
Date: Tue Mar 21 14:29:18 2006
New Revision: 387651

URL: http://svn.apache.org/viewcvs?rev=387651view=rev
Log:
Add an urlfilter based on dk.brics.automaton.

Added:
lucene/nutch/trunk/src/plugin/urlfilter-automaton/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml   (with props)
lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar   (with 
props)
lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml   (with props)
lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.rules
lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.urls

lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules

lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls

lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules

lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/

lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/

lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/AutomatonURLFilter.java
   (with props)

lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/package.html
   (with props)
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/
lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/

lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/

lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/TestAutomatonURLFilter.java
   (with props)

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml?rev=387651view=auto
==
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml Tue Mar 21 
14:29:18 2006
@@ -0,0 +1,32 @@
+?xml version=1.0?
+
+project name=urlfilter-automaton default=jar-core
+
+  import file=../build-plugin.xml/
+
+  !-- Build compilation dependencies --
+  target name=deps-jar
+ant target=jar inheritall=false dir=../lib-regex-filter/
+ant target=compile-test inheritall=false dir=../lib-regex-filter/
+  /target
+
+  !-- Add compilation dependencies to classpath --
+  path id=plugin.deps
+fileset dir=${nutch.root}/build
+  include name=**/lib-regex-filter/*.jar /
+/fileset
+pathelement location=${nutch.root}/build/lib-regex-filter/test/
+  /path
+
+  !-- Deploy Unit test dependencies --
+  target name=deps-test
+ant target=deploy inheritall=false dir=../lib-regex-filter/
+  /target
+
+  !-- for junit test --
+  mkdir dir=${build.test}/data/
+  copy todir=${build.test}/data
+fileset dir=sample includes=**/*.rules, **/*.urls/
+  /copy
+
+/project

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml
--
svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar?rev=387651view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml?rev=387651view=auto
==
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml Tue Mar 21 
14:29:18 2006
@@ -0,0 +1,25 @@
+?xml version=1.0 encoding=UTF-8?
+plugin
+   id=urlfilter-automaton
+   name=Automaton URL Filter
+   version=1.0.0
+   provider-name=nutch.org
+
+   runtime
+  library name=urlfilter-automaton.jar
+ export name=*/
+  /library

svn commit: r387659 - in /lucene/nutch/trunk/src/plugin: urlfilter-automaton/plugin.xml urlfilter-regex/plugin.xml

2006-03-21 Thread jerome
Author: jerome
Date: Tue Mar 21 14:49:54 2006
New Revision: 387659

URL: http://svn.apache.org/viewcvs?rev=387659view=rev
Log:
Add some missing runtime dependencies

Modified:
lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml

Modified: lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml?rev=387659r1=387658r2=387659view=diff
==
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml Tue Mar 21 
14:49:54 2006
@@ -9,10 +9,12 @@
   library name=urlfilter-automaton.jar
  export name=*/
   /library
+  library name=automaton.jar/
/runtime
 
requires
   import plugin=nutch-extensionpoints/
+  import plugin=lib-regex-filter/
/requires
 
extension id=org.apache.nutch.net.urlfilter

Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml?rev=387659r1=387658r2=387659view=diff
==
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml Tue Mar 21 
14:49:54 2006
@@ -13,6 +13,7 @@
 
requires
   import plugin=nutch-extensionpoints/
+  import plugin=lib-regex-filter/
/requires
 
extension id=org.apache.nutch.net.urlfilter




svn commit: r385702 - /lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java

2006-03-13 Thread jerome
Author: jerome
Date: Mon Mar 13 16:00:38 2006
New Revision: 385702

URL: http://svn.apache.org/viewcvs?rev=385702view=rev
Log:
Fix NPE if lang is null

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=385702r1=385701r2=385702view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java 
Mon Mar 13 16:00:38 2006
@@ -78,6 +78,7 @@
 
   private Extension getExtension(String lang) {
 
+if (lang == null) { return null; }
 Extension extension = (Extension) this.conf.getObject(lang);
 if (extension == null) {
   extension = findExtension(lang);




svn commit: r385267 - /lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml

2006-03-12 Thread jerome
Author: jerome
Date: Sun Mar 12 01:41:13 2006
New Revision: 385267

URL: http://svn.apache.org/viewcvs?rev=385267view=rev
Log:
NUTCH-228, clustering plugin descriptor fixed (Dawid Weiss)

Modified:
lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=385267r1=385266r2=385267view=diff
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Sun Mar 12 
01:41:13 2006
@@ -18,7 +18,7 @@
   library name=carrot2-util-tokenizer.jar/
 
   library name=colt-1.0.3.jar/
-  library name=commons-collections-3.0.jar/
+  library name=commons-collections-3.1-patched.jar/
   library name=commons-pool-1.1.jar/
   library name=FSA.jar/
   library name=Jama-1.0.1-patched.jar/




svn commit: r385268 - /lucene/nutch/trunk/src/web/jsp/cluster.jsp

2006-03-12 Thread jerome
Author: jerome
Date: Sun Mar 12 01:42:44 2006
New Revision: 385268

URL: http://svn.apache.org/viewcvs?rev=385268&view=rev
Log:
Fix milliseconds dumped by cluster.jsp (Dawid Weiss)

Modified:
lucene/nutch/trunk/src/web/jsp/cluster.jsp

Modified: lucene/nutch/trunk/src/web/jsp/cluster.jsp
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/cluster.jsp?rev=385268r1=385267r2=385268view=diff
==
--- lucene/nutch/trunk/src/web/jsp/cluster.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/cluster.jsp Sun Mar 12 01:42:44 2006
@@ -21,11 +21,11 @@
 // cluster the hits
 HitsCluster [] clusters = null;
 if (clusterer != null) {
-  long clusteringStart = System.currentTimeMillis();
+  final long clusteringStart = System.currentTimeMillis();
   try {
 clusters = clusterer.clusterHits( details, summaries );
-   long clusteringDuration = start - System.currentTimeMillis();
-   bean.LOG.info(Clustering took:  + clusteringDuration +  
milliseconds.);
+final long clusteringDuration = System.currentTimeMillis() - 
clusteringStart;
+bean.LOG.info(Clustering took:  + clusteringDuration +  milliseconds.);
   } catch (Exception e) {
 // failed to do clustering (see below)
   }




svn commit: r382981 - in /lucene/nutch/trunk/src/plugin: ./ analysis-de/ analysis-fr/ clustering-carrot2/ creativecommons/ index-basic/ index-more/ languageidentifier/ lib-http/ lib-parsems/ microform

2006-03-03 Thread jerome
Author: jerome
Date: Fri Mar  3 16:26:54 2006
New Revision: 382981

URL: http://svn.apache.org/viewcvs?rev=382981&view=rev
Log:
Plugins now assume that the core is already built when building Nutch

Modified:
lucene/nutch/trunk/src/plugin/analysis-de/build.xml
lucene/nutch/trunk/src/plugin/analysis-fr/build.xml
lucene/nutch/trunk/src/plugin/build-plugin.xml
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
lucene/nutch/trunk/src/plugin/creativecommons/build.xml
lucene/nutch/trunk/src/plugin/index-basic/build.xml
lucene/nutch/trunk/src/plugin/index-more/build.xml
lucene/nutch/trunk/src/plugin/languageidentifier/build.xml
lucene/nutch/trunk/src/plugin/lib-http/build.xml
lucene/nutch/trunk/src/plugin/lib-parsems/build.xml
lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml
lucene/nutch/trunk/src/plugin/ontology/build.xml
lucene/nutch/trunk/src/plugin/parse-ext/build.xml
lucene/nutch/trunk/src/plugin/parse-html/build.xml
lucene/nutch/trunk/src/plugin/parse-js/build.xml
lucene/nutch/trunk/src/plugin/parse-mp3/build.xml
lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
lucene/nutch/trunk/src/plugin/parse-msword/build.xml
lucene/nutch/trunk/src/plugin/parse-pdf/build.xml
lucene/nutch/trunk/src/plugin/parse-rss/build.xml
lucene/nutch/trunk/src/plugin/parse-swf/build.xml
lucene/nutch/trunk/src/plugin/parse-text/build.xml
lucene/nutch/trunk/src/plugin/parse-zip/build.xml
lucene/nutch/trunk/src/plugin/protocol-file/build.xml
lucene/nutch/trunk/src/plugin/protocol-ftp/build.xml
lucene/nutch/trunk/src/plugin/protocol-http/build.xml
lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml
lucene/nutch/trunk/src/plugin/query-basic/build.xml
lucene/nutch/trunk/src/plugin/query-more/build.xml
lucene/nutch/trunk/src/plugin/query-site/build.xml
lucene/nutch/trunk/src/plugin/query-url/build.xml
lucene/nutch/trunk/src/plugin/urlfilter-prefix/build.xml
lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml

Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/build.xml?rev=382981r1=382980r2=382981view=diff
==
--- lucene/nutch/trunk/src/plugin/analysis-de/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/analysis-de/build.xml Fri Mar  3 16:26:54 2006
@@ -1,12 +1,10 @@
 ?xml version=1.0?
 
-project name=analysis-de default=jar
+project name=analysis-de default=jar-core
 
   import file=../build-plugin.xml/
-
-  !-- Build compilation dependencies --
+  
   target name=deps-jar
-ant target=compile-core inheritall=false dir=${nutch.root}/
 ant target=jar inheritall=false dir=../lib-lucene-analyzers/
   /target
 

Modified: lucene/nutch/trunk/src/plugin/analysis-fr/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-fr/build.xml?rev=382981r1=382980r2=382981view=diff
==
--- lucene/nutch/trunk/src/plugin/analysis-fr/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/analysis-fr/build.xml Fri Mar  3 16:26:54 2006
@@ -1,12 +1,11 @@
 ?xml version=1.0?
 
-project name=analysis-fr default=jar
+project name=analysis-fr default=jar-core
 
   import file=../build-plugin.xml/
 
   !-- Build compilation dependencies --
   target name=deps-jar
-ant target=compile-core inheritall=false dir=${nutch.root}/
 ant target=jar inheritall=false dir=../lib-lucene-analyzers/
   /target
 

Modified: lucene/nutch/trunk/src/plugin/build-plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build-plugin.xml?rev=382981r1=382980r2=382981view=diff
==
--- lucene/nutch/trunk/src/plugin/build-plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/build-plugin.xml Fri Mar  3 16:26:54 2006
@@ -96,9 +96,14 @@
  source=${javac.version}
  deprecation=${javac.deprecation}
   classpath refid=classpath/
-/javac
+/javac
   /target
 
+  target name=compile-core
+ant target=compile-core inheritall=false dir=${nutch.root}/
+ant target=compile/
+  /target
+  
   !-- == --
   !-- Make plugin .jar   --
   !-- == --
@@ -109,6 +114,13 @@
   jarfile=${build.dir}/${name}.jar
   basedir=${build.classes}
 /
+  /target
+
+  target name=jar-core depends=compile-core
+jar
+jarfile=${build.dir}/${name}.jar
+basedir=${build.classes}
+/
   /target

svn commit: r382535 - in /lucene/nutch/trunk: conf/nutch-default.xml src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java src/plugin/protocol-ftp/src/java/org/apache/nutc

2006-03-02 Thread jerome
Author: jerome
Date: Thu Mar  2 14:38:40 2006
New Revision: 382535

URL: http://svn.apache.org/viewcvs?rev=382535&view=rev
Log:
Fix content.limit inconsistency in http, ftp and file

Modified:
lucene/nutch/trunk/conf/nutch-default.xml

lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java

lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=382535r1=382534r2=382535view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Mar  2 14:38:40 2006
@@ -13,8 +13,8 @@
   namefile.content.limit/name
   value65536/value
   descriptionThe length limit for downloaded content, in bytes.
-  If this value is larger than zero, content longer than it will be
-  truncated; otherwise (zero or negative), no truncation at all.
+  If this value is nonnegative (>=0), content longer than it will be truncated;
+  otherwise, no truncation at all.
   /description
 /property
 
@@ -150,11 +150,11 @@
   nameftp.content.limit/name
   value65536/value 
   descriptionThe length limit for downloaded content, in bytes.
-  If this value is larger than zero, content longer than it is truncated;
-  otherwise (zero or negative), no truncation at all. Caution: classical
-  ftp RFCs never defines partial transfer and, in fact, some ftp servers
-  out there do not handle client side forced close-down very well.
-  Our implementation tries its best to handle such situations smoothly.
+  If this value is nonnegative (>=0), content longer than it will be truncated;
+  otherwise, no truncation at all.
+  Caution: classical ftp RFCs never defines partial transfer and, in fact,
+  some ftp servers out there do not handle client side forced close-down very
+  well. Our implementation tries its best to handle such situations smoothly.
   /description
 /property
 

Modified: 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=382535r1=382534r2=382535view=diff
==
--- 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 Thu Mar  2 14:38:40 2006
@@ -167,7 +167,7 @@
 // capture content
 int len = (int) size;
 
-if (this.file.maxContentLength  0  len  this.file.maxContentLength)
+if (this.file.maxContentLength = 0  len  this.file.maxContentLength)
   len = this.file.maxContentLength;
 
 this.content = new byte[len];

Modified: 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java?rev=382535r1=382534r2=382535view=diff
==
--- 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
 Thu Mar  2 14:38:40 2006
@@ -344,9 +344,9 @@
 }
 entries.add(ftpFile);
 count += line.length();
-// impose download limit if limit > 0, otherwise no limit
+// impose download limit if limit >= 0, otherwise no limit
 // here, cut off is up to the line when total bytes is just over limit
-if (limit  0  count  limit) {
+if (limit = 0  count  limit) {
   mandatory_close = true;
   break;
 }
@@ -409,9 +409,9 @@
 new byte[org.apache.commons.net.io.Util.DEFAULT_COPY_BUFFER_SIZE];
   while((len=input.read(buf,0,buf.length)) != -1){
 count += len;
-// impose download limit if limit > 0, otherwise no limit
+// impose download limit if limit >= 0, otherwise no limit
 // here, cut off is exactly of limit bytes
-if (limit  0  count  limit) {
+if (limit = 0  count  limit) {
   os.write(buf,0,len-(count-limit));
   mandatory_close = true;
   break;




svn commit: r379403 - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/parse/ src/plugin/creativecommons/src/test/org/creativecommons/nutch/ src/plugin/languageidentifier/src/java/org/apache/nu

2006-02-21 Thread jerome
Author: jerome
Date: Tue Feb 21 01:54:21 2006
New Revision: 379403

URL: http://svn.apache.org/viewcvs?rev=379403&view=rev
Log:
NUTCH-140, parse-plugin.xml can now use extension-id and plugin-id

Modified:
lucene/nutch/trunk/conf/parse-plugins.dtd
lucene/nutch/trunk/conf/parse-plugins.xml
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java

lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java

lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java

lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java

lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java

lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java

lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java

lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java

lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java

lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java

lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java

lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java

lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml

Modified: lucene/nutch/trunk/conf/parse-plugins.dtd
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=379403r1=379402r2=379403view=diff
==
--- lucene/nutch/trunk/conf/parse-plugins.dtd (original)
+++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Feb 21 01:54:21 2006
@@ -1,7 +1,12 @@
-!ELEMENT parse-plugins (mimeType+)
+!ELEMENT parse-plugins  (mimeType+,aliases)
 !ELEMENT mimeType (plugin+)
 !ATTLIST mimeType name CDATA #REQUIRED
 
 !ELEMENT plugin EMPTY
 !ATTLIST plugin id CDATA #REQUIRED
-!ATTLIST plugin order CDATA ''
\ No newline at end of file
+!ATTLIST plugin order CDATA ''
+
+!ELEMENT aliases (alias+)
+!ELEMENT alias EMPTY
+!ATTLIST alias name CDATA #REQUIRED
+!ATTLIST alias extension-id CDATA #REQUIRED
\ No newline at end of file

Modified: lucene/nutch/trunk/conf/parse-plugins.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=379403r1=379402r2=379403view=diff
==
--- lucene/nutch/trunk/conf/parse-plugins.xml (original)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Feb 21 01:54:21 2006
@@ -218,4 +218,33 @@
plugin id=parse-ext /
/mimeType
 
+   !--  alias mappings for parse-xxx names to the actual extension 
implementation 
+   ids described in each plugin's plugin.xml file --
+   aliases
+   alias name=parse-ext extension-id=ExtParser /
+   alias name=parse-html
+   extension-id=org.apache.nutch.parse.html.HtmlParser /
+   alias name=parse-js extension-id=JSParser /
+   alias name=parse-mp3
+   extension-id=org.apache.nutch.parse.mp3.MP3Parser /
+   alias name=parse-msexcel
+   
extension-id=org.apache.nutch.parse.msexcel.MSExcelParser /
+   alias name=parse-mspowerpoint
+   
extension-id=org.apache.nutch.parse.mspowerpoint.MSPowerPointParser /
+   alias name=parse-msword
+   
extension-id=org.apache.nutch.parse.msword.MSWordParser /
+   alias name=parse-pdf
+   extension-id=org.apache.nutch.parse.pdf.PdfParser /
+   alias name=parse-rss
+   extension-id=org.apache.nutch.parse.rss.RSSParser /
+   alias name=parse-rtf
+   
extension-id=org.apache.nutch.parse.rtf.RTFParseFactory /
+   alias name=parse-swf
+   extension-id=org.apache.nutch.parse.swf.SWFParser /
+   alias name=parse-text
+   extension-id=org.apache.nutch.parse.text.TextParser /
+   alias name=parse-zip
+   extension-id=org.apache.nutch.parse.zip.ZipParser /
+   /aliases
+   
 /parse-plugins

svn commit: r379419 - in /lucene/nutch/trunk: site/mailing_lists.html site/mailing_lists.pdf src/site/src/documentation/content/xdocs/mailing_lists.xml

2006-02-21 Thread jerome
Author: jerome
Date: Tue Feb 21 03:10:42 2006
New Revision: 379419

URL: http://svn.apache.org/viewcvs?rev=379419&view=rev
Log:
NUTCH-214, Add a search mailing list archive link (Jake Vanderdray)

Modified:
lucene/nutch/trunk/site/mailing_lists.html
lucene/nutch/trunk/site/mailing_lists.pdf

lucene/nutch/trunk/src/site/src/documentation/content/xdocs/mailing_lists.xml

Modified: lucene/nutch/trunk/site/mailing_lists.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/mailing_lists.html?rev=379419r1=379418r2=379419view=diff
==
--- lucene/nutch/trunk/site/mailing_lists.html (original)
+++ lucene/nutch/trunk/site/mailing_lists.html Tue Feb 21 03:10:42 2006
@@ -3,7 +3,7 @@
 head
 META http-equiv=Content-Type content=text/html; charset=UTF-8
 meta content=Apache Forrest name=Generator
-meta name=Forrest-version content=0.6
+meta name=Forrest-version content=0.7
 meta name=Forrest-skin-name content=pelt
 titleNutch Mailing Lists/title
 link type=text/css href=skin/basic.css rel=stylesheet
@@ -16,49 +16,22 @@
 body onload=init()
 script type=text/javascriptndeSetTextSize();/script
 div id=top
-!--+
-|breadtrail
-+--
 div class=breadtrail
 a href=http://www.apache.org/;Apache/a gt; a 
href=http://lucene.apache.org/;Lucene/a gt; a 
href=http://lucene.apache.org/nutch/;Nutch/ascript 
src=skin/breadcrumbs.js language=JavaScript type=text/javascript/script
 /div
-!--+
-|header
-+--
 div class=header
-!--+
-|start group logo
-+--
 div class=grouplogo
 a href=http://lucene.apache.org/;img class=logoImage alt=Lucene 
src=http://lucene.apache.org/java/docs/images/lucene_green_150.gif; 
title=Apache Lucene/a
 /div
-!--+
-|end group logo
-+--
-!--+
-|start Project Logo
-+--
 div class=projectlogo
 a href=http://lucene.apache.org/nutch/;img class=logoImage alt=Nutch 
src=images/nutch-logo.gif title=Open Source Web Search Software/a
 /div
-!--+
-|end Project Logo
-+--
-!--+
-|start Search
-+--
 div class=searchbox
 form action=http://www.google.com/search; method=get class=roundtopsmall
-input value=lucene.apache.org name=sitesearch type=hiddeninput 
onFocus=getBlank (this, 'Search the site with google:'); value=Search the 
site with google: size=25 name=q id=query type=textnbsp; 
-input name=Search value=Search type=submit
+input value=lucene.apache.org name=sitesearch type=hiddeninput 
onFocus=getBlank (this, 'Search the site with google'); size=25 name=q 
id=query type=text value=Search the site with googlenbsp; 
+input attr=value name=Search value=Search 
type=submit
 /form
 /div
-!--+
-|end search
-+--
-!--+
-|start Tabs
-+--
 ul id=tabs
 li class=current
 a class=base-selected href=index.htmlMain/a
@@ -67,110 +40,83 @@
 a class=base-not-selected href=http://wiki.apache.org/nutch/;Wiki/a
 /li
 /ul
-!--+
-|end Tabs
-+--
 /div
 /div
 div id=main
 div id=publishedStrip
-!--+
-|start Subtabs
-+--
 div id=level2tabs/div
-!--+
-|end Endtabs
-+--
-script type=text/javascript language=JavaScript!--
-  document.write(Published:  + document.lastModified);
-  //  --/script
-/div
-!--+
-|breadtrail
-+--
+script type=text/javascript!--
+document.write(textLast Published:/text  + document.lastModified);
+//  --/script
+/div
 div class=breadtrail
  
  nbsp;
/div
-!--+
-|start Menu, mainarea
-+--
-!--+
-|start Menu
-+--
 div id=menu
 div onclick=SwitchMenu('menu_1.1', 'skin/') id=menu_1.1Title 
class=menutitleProject/div
 div id=menu_1.1 class=menuitemgroup
 div class=menuitem
-a title= href=index.htmlNews/a
+a href=index.htmlNews/a
 /div
 div class=menuitem
-a title= href=about.htmlAbout/a
+a href=about.htmlAbout/a
 /div
 div class=menuitem
-a title= href=credits.htmlCredits/a
+a href=credits.htmlCredits/a
 /div
 div class=menuitem
-a title= href=http://www.cafepress.com/nutch/;Buy Stuff/a
+a href=http://www.cafepress.com/nutch/;Buy Stuff/a
 /div
 /div
 div onclick=SwitchMenu('menu_1.2', 'skin/') id=menu_1.2Title 
class=menutitleDocumentation/div
 div id=menu_1.2 class=menuitemgroup
 div class=menuitem
-a title= href=http://wiki.apache.org/nutch/FAQ;FAQ/a
+a href=http://wiki.apache.org/nutch/FAQ;FAQ/a
 /div
 div class=menuitem
-a title= href=http://wiki.apache.org/nutch/;Wiki/a
+a href=http://wiki.apache.org/nutch/;Wiki/a
 /div
 div class=menuitem
-a title= href=tutorial.htmlTutorial/a
+a href=tutorial.htmlTutorial/a
 /div
 div class=menuitem
-a title= href=bot.htmlRobot /a
+a href=bot.htmlRobot /a
 /div
 div class=menuitem
-a title= href=i18n.htmli18n/a
+a href=i18n.htmli18n/a
 /div
 div class=menuitem
-a title= href=apidocs/index.htmlAPI Docs/a
+a href=apidocs/index.htmlAPI Docs/a
 /div
 /div
 div onclick=SwitchMenu('menu_selected_1.3', 'skin/') 
id=menu_selected_1.3Title class=menutitle style=background-image

svn commit: r379511 - in /lucene/nutch/trunk: docs/fr/help.html src/web/pages/fr/help.xml

2006-02-21 Thread jerome
Author: jerome
Date: Tue Feb 21 08:05:07 2006
New Revision: 379511

URL: http://svn.apache.org/viewcvs?rev=379511&view=rev
Log:
Add fr help page

Added:
lucene/nutch/trunk/docs/fr/help.html   (with props)
lucene/nutch/trunk/src/web/pages/fr/help.xml   (with props)

Added: lucene/nutch/trunk/docs/fr/help.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/fr/help.html?rev=379511view=auto
==
--- lucene/nutch/trunk/docs/fr/help.html (added)
+++ lucene/nutch/trunk/docs/fr/help.html Tue Feb 21 08:05:07 2006
@@ -0,0 +1,161 @@
+!DOCTYPE HTML PUBLIC -//W3C//DTD HTML 4.01 Transitional//EN
+html!--This page is automatically generated.  Do not edit!--
+head
+META http-equiv=Content-Type content=text/html; charset=UTF-8
+titleNutch: aide/title
+style type=text/css
+.menuTd {background-color: #F9F7F4; height: 25; onMouseOver: 
this.style.backgroundColor='#ECE5DC';}
+.menuTdhover {background-color: #ECE5DC; height: 25; onMouseOver: 
this.style.backgroundColor='#ECE5DC';}
+.menuEntry {  font-family: Arial, Helvetica, sans-serif; font-size: 12px; 
color: #00; text-decoration: none}
+.body {background-color: #F9F7F4;}
+.bodytext {  font-family: Arial, Helvetica, sans-serif; font-size: 12px; 
color: #00; text-decoration: none}
+.title {  font-family: Arial, Helvetica, sans-serif; font-size: 26px; color: 
#FF9900; text-decoration: none}
+.intro {  font-family: Arial, Helvetica, sans-serif; font-size: 12px; color: 
#FF9900; text-decoration: none}
+.orangeTd {background-color: #FF9900}
+ul {list-style-image: url(../img/reiter/ul.gif)}
+h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: 
#00;}
+h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: 
#00;}
+.url {color: #996600;}
+.highlight {font-weight: bold;}
+.ellipsis {font-weight: bold;}
+/style
+link type=image/x-icon href=../img/favicon.ico rel=icon
+link type=image/x-icon href=../img/favicon.ico rel=shortcut icon
+script type=text/javascript
+!--
+function queryfocus() { document.search.query.focus(); }
+// --/script
+/head
+body onLoad=queryfocus();
+!--This file is automatically generated.  Do not edit!--
+table cellspacing=0 cellpadding=0 border=0 width=635
+tr
+td rowspan=2 width=140 valign=bottoma href=./img border=0 
src=../img/reiter/logo_nutch.gif/aimg height=1 width=140 
src=../img/reiter/spacer_66.gif/td
+/tr
+tr
+td align=right valign=bottom width=495
+table width=495 cellspacing=0 cellpadding=0 border=0
+tr
+td width=400 background=../img/reiter/_bg_reiter.gifnbsp;/tdtd 
width=10 valign=bottom height=28img border=0 
src=../img/reiter/reiter_inactive_le1.gif/tdtd nowrap=nowrap 
valign=bottom background=../img/reiter/_bg_reiter_inactive.gifa 
href=about.html class=bodytextA propos/a/tdtd width=10 
valign=bottom height=28img border=0 
src=../img/reiter/reiter_inactive_ri.gif/tdtd width=10 valign=bottom 
height=28img border=0 src=../img/reiter/reiter_inactive_le.gif/tdtd 
nowrap=nowrap valign=bottom 
background=../img/reiter/_bg_reiter_inactive.gifa 
href=http://wiki.apache.org/nutch/FAQ; class=bodytextQuestions 
freacute;quentes/a/tdtd width=10 valign=bottom height=28img 
border=0 src=../img/reiter/reiter_inactive_ri.gif/td
+/tr
+/table
+/td
+/tr
+/table
+table cellspacing=0 cellpadding=0 border=0 width=635
+tr valign=top
+td width=140
+table cellspacing=0 cellpadding=0 width=100%
+tr
+td#160;/td
+/tr
+/table
+/tdtd background=../img/reiter/_spacer_cc.gif 
width=20#160;/tdtd class=body width=475
+table cellspacing=0 cellpadding=0 border=0 width=475
+tr
+td valign=bottom width=275 height=125 class=titleaide/tdtd 
valign=bottom width=200 height=125img 
src=../img/reiter/robots.gif/td
+/tr
+/table
+br class=br
+span class=bodytext
+
+/spanspan class=bodytext
+h3Requecirc;tes/h3
+/spanspan class=bodytext
+Pour effectuer une recherche avec Nutch, il suffit de saisir quelques mots.
+/spanspan class=bodytext
+ul
+  
+liLes reacute;sultats contiendront uniquement les pages qui contiennent
+span style=font-style: italic;tous/span les mots de la question./li
+  
+liUtilisez les doubles chevrons autour des termes qui doivent ecirc;tre 
adjacents,
+comme dans le cas d'une phrase. Par exemple span style=font-weight: bold;
+Nouvelle Zeacute;lande/span./li
+  
+liLa ponctuation entre les mots est ignoreacute;e. Ainsi, la recherche 
+span style=font-weight: bold;http://www.nutch.org//span est 
eacute;quivalente agrave;
+span style=font-weight: bold;http www nutch org/span./li
+  
+liLa recherche n'est pas sensible agrave; la casse. Ainsi, la recherche
+span style=font-weight: bold;NuTcH/span est eacute;quivalente agrave; 
span style=font-weight: bold;nUtCh/span./li
+  
+liVous pouvez exclure un mot des reacute;sultats en placcedil;ant un signe 
moins
+devant. Ainsi, la recherche span style=font-weight: bold;football
+-nfl/span retournera les pages qui parlent de football, mais qui ne 
contiennent
+pas le mot nfl./li
+
+/ul

svn commit: r378653 - in /lucene/nutch/trunk/src/plugin/parse-rtf/src: java/org/apache/nutch/parse/rtf/RTFParseFactory.java java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java test/org/apache/n

2006-02-17 Thread jerome
Author: jerome
Date: Fri Feb 17 15:22:55 2006
New Revision: 378653

URL: http://svn.apache.org/viewcvs?rev=378653&view=rev
Log:
Adapt parse-rtf to nutch APIs changes (metadata, parse, protocol, ...)

Modified:

lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java

lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java

lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java

Modified: 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=378653r1=378652r2=378653view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
 Fri Feb 17 15:22:55 2006
@@ -13,38 +13,42 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.rtf;
 
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
- .mine
- .mine
-import org.apache.nutch.util.MetadataNames;
-
-===
-import org.apache.nutch.util.NutchConf;
-===
-import org.apache.hadoop.conf.Configuration;
- .r374853
- .r373941
+// JDK imports
 import java.io.ByteArrayInputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.Properties;
 
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
+// RTF Parser imports
+import com.etranslate.tm.processing.rtf.ParseException;
 import com.etranslate.tm.processing.rtf.RTFParser;
 
+
 /**
  * A parser for RTF documents
  * 
  * @author Andy Hedges
  */
-public class RTFParseFactory implements Parser, MetadataNames {
+public class RTFParseFactory implements Parser {
 
   private Configuration conf;
 
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
 byte[] raw = content.getContent();
 Reader reader = new InputStreamReader(new ByteArrayInputStream(raw));
 RTFParserDelegateImpl delegate = new RTFParserDelegateImpl();
@@ -55,28 +59,31 @@
 
 try {
   rtfParser.parse();
-} catch (com.etranslate.tm.processing.rtf.ParseException e) {
-  throw new ParseException(Exception parsing RTF document, e);
+} catch (ParseException e) {
+return new ParseStatus(ParseStatus.FAILED,
+   ParseStatus.FAILED_EXCEPTION,
+   e.toString()).getEmptyParse(conf);
 }
 
-Properties metadata = new Properties();
-metadata.putAll(content.getMetadata());
-metadata.putAll(delegate.getMetaData());
-String title = metadata.getProperty(TITLE);
+Metadata metadata = new Metadata();
+metadata.setAll(delegate.getMetaData());
+String title = metadata.get(DublinCore.TITLE);
 
 if (title != null) {
-//(CM): Why remove the title metadata property here? Even 
-//though it's stored in the ParseData, it still might be useful
-//to have via this properties object?
-//metadata.remove(title);
+  metadata.remove(DublinCore.TITLE);
 } else {
   title = ;
 }
 
 String text = delegate.getText();
 
-return new ParseImpl(text, new ParseData(title, OutlinkExtractor
-.getOutlinks(text, this.conf), metadata));
+return new ParseImpl(text,
+ new ParseData(ParseStatus.STATUS_SUCCESS,
+   title,
+   OutlinkExtractor
+.  getOutlinks(text, this.conf),
+   content.getMetadata(),
+   metadata));
   }
 
   public void setConf(Configuration conf) {

Modified: 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java?rev=378653r1=378652r2=378653view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
 (original

svn commit: r378655 - in /lucene/nutch/trunk/src/plugin: ./ analysis-de/ analysis-fr/ clustering-carrot2/ creativecommons/ index-basic/ index-more/ languageidentifier/ lib-commons-httpclient/ lib-http

2006-02-17 Thread jerome
Author: jerome
Date: Fri Feb 17 15:28:39 2006
New Revision: 378655

URL: http://svn.apache.org/viewcvs?rev=378655&view=rev
Log:
Review plugins building and testing

Modified:
lucene/nutch/trunk/src/plugin/analysis-de/build.xml
lucene/nutch/trunk/src/plugin/analysis-fr/build.xml
lucene/nutch/trunk/src/plugin/build-plugin.xml
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
lucene/nutch/trunk/src/plugin/creativecommons/build.xml
lucene/nutch/trunk/src/plugin/index-basic/build.xml
lucene/nutch/trunk/src/plugin/index-more/build.xml
lucene/nutch/trunk/src/plugin/languageidentifier/build.xml
lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml
lucene/nutch/trunk/src/plugin/lib-http/build.xml
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/build.xml
lucene/nutch/trunk/src/plugin/lib-log4j/build.xml
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml
lucene/nutch/trunk/src/plugin/lib-nekohtml/build.xml
lucene/nutch/trunk/src/plugin/lib-parsems/build.xml
lucene/nutch/trunk/src/plugin/ontology/build.xml
lucene/nutch/trunk/src/plugin/parse-ext/build.xml
lucene/nutch/trunk/src/plugin/parse-html/build.xml
lucene/nutch/trunk/src/plugin/parse-js/build.xml
lucene/nutch/trunk/src/plugin/parse-mp3/build.xml
lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
lucene/nutch/trunk/src/plugin/parse-msword/build.xml
lucene/nutch/trunk/src/plugin/parse-pdf/build.xml
lucene/nutch/trunk/src/plugin/parse-rss/build.xml
lucene/nutch/trunk/src/plugin/parse-rtf/build.xml
lucene/nutch/trunk/src/plugin/parse-swf/build.xml
lucene/nutch/trunk/src/plugin/parse-text/build.xml
lucene/nutch/trunk/src/plugin/parse-zip/build.xml
lucene/nutch/trunk/src/plugin/protocol-file/build.xml
lucene/nutch/trunk/src/plugin/protocol-ftp/build.xml
lucene/nutch/trunk/src/plugin/protocol-http/build.xml
lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml
lucene/nutch/trunk/src/plugin/query-basic/build.xml
lucene/nutch/trunk/src/plugin/query-more/build.xml
lucene/nutch/trunk/src/plugin/query-site/build.xml
lucene/nutch/trunk/src/plugin/query-url/build.xml
lucene/nutch/trunk/src/plugin/urlfilter-prefix/build.xml
lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml

Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/build.xml?rev=378655r1=378654r2=378655view=diff
==
--- lucene/nutch/trunk/src/plugin/analysis-de/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/analysis-de/build.xml Fri Feb 17 15:28:39 2006
@@ -4,9 +4,16 @@
 
   import file=../build-plugin.xml/
 
+  !-- Build compilation dependencies --
+  target name=deps-jar
+ant target=compile-core inheritall=false dir=${nutch.root}/
+ant target=jar inheritall=false dir=../lib-lucene-analyzers/
+  /target
+
+  !-- Add compilation dependencies to classpath --
   path id=plugin.deps
-fileset dir=../lib-lucene-analyzers/lib
-  include name=*.jar /
+fileset dir=${nutch.root}/build
+  include name=**/lib-lucene-analyzers/*.jar /
 /fileset
   /path
 

Modified: lucene/nutch/trunk/src/plugin/analysis-fr/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-fr/build.xml?rev=378655r1=378654r2=378655view=diff
==
--- lucene/nutch/trunk/src/plugin/analysis-fr/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/analysis-fr/build.xml Fri Feb 17 15:28:39 2006
@@ -4,9 +4,16 @@
 
   import file=../build-plugin.xml/
 
+  !-- Build compilation dependencies --
+  target name=deps-jar
+ant target=compile-core inheritall=false dir=${nutch.root}/
+ant target=jar inheritall=false dir=../lib-lucene-analyzers/
+  /target
+
+  !-- Add compilation dependencies to classpath --
   path id=plugin.deps
-fileset dir=../lib-lucene-analyzers/lib
-  include name=*.jar /
+fileset dir=${nutch.root}/build
+  include name=**/lib-lucene-analyzers/*.jar /
 /fileset
   /path
 

Modified: lucene/nutch/trunk/src/plugin/build-plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build-plugin.xml?rev=378655r1=378654r2=378655view=diff
==
--- lucene/nutch/trunk/src/plugin/build-plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/build-plugin.xml Fri Feb 17 15:28:39 2006
@@ -68,10 +68,22 @@
   !-- to be overridden by sub-projects -- 
   target name=init-plugin/
 
+  !--
+   ! Used to build plugin compilation dependencies
+   ! (to be overridden by plugins)
+   !--
+  target name=deps-jar/
+
+  !--
+   ! Used to deploy plugin runtime

svn commit: r378667 - in /lucene/nutch/trunk/src/plugin/parse-mp3/src: java/org/apache/nutch/parse/mp3/MP3Parser.java java/org/apache/nutch/parse/mp3/MetadataCollector.java test/org/apache/nutch/parse

2006-02-17 Thread jerome
Author: jerome
Date: Fri Feb 17 16:23:35 2006
New Revision: 378667

URL: http://svn.apache.org/viewcvs?rev=378667&view=rev
Log:
Adapt parse-mp3 to Nutch API changes (metadata, parse, protocol, ...)

Modified:

lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java

lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java

lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java

Modified: 
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java?rev=378667r1=378666r2=378667view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
 Fri Feb 17 16:23:35 2006
@@ -16,10 +16,14 @@
 
 package org.apache.nutch.parse.mp3;
 
+// JDK imports
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.Iterator;
 
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-import org.apache.hadoop.conf.Configuration;
+// Java ID3 Tag imports
 import org.farng.mp3.MP3File;
 import org.farng.mp3.TagException;
 import org.farng.mp3.id3.AbstractID3v2;
@@ -27,29 +31,35 @@
 import org.farng.mp3.id3.ID3v1;
 import org.farng.mp3.object.AbstractMP3Object;
 
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.util.Iterator;
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
 
 /**
  * A parser for MP3 audio files
  * @author Andy Hedges
  */
-
 public class MP3Parser implements Parser {
 
   private MetadataCollector metadataCollector;
   private Configuration conf;
 
-  public Parse getParse(Content content) throws ParseException {
-Parse parse = null;
-metadataCollector.putAll(content.getMetadata());
+  public Parse getParse(Content content) {
 
+Parse parse = null;
 byte[] raw = content.getContent();
-
 File tmp = null;
+
 try {
   tmp = File.createTempFile(nutch, .mp3);
   FileOutputStream fos = new FileOutputStream(tmp);
@@ -58,25 +68,31 @@
   MP3File mp3 = new MP3File(tmp);
 
   if (mp3.hasID3v2Tag()) {
-parse = getID3v2Parse(mp3);
+parse = getID3v2Parse(mp3, content.getMetadata());
   } else if (mp3.hasID3v1Tag()) {
-parse = getID3v1Parse(mp3);
+parse = getID3v1Parse(mp3, content.getMetadata());
   } else {
-throw new ParseException(No textual content available);
+return new ParseStatus(ParseStatus.FAILED,
+   ParseStatus.FAILED_MISSING_CONTENT,
+   No textual content 
available).getEmptyParse(conf);
   }
-
-
 } catch (IOException e) {
-  throw new ParseException(Couldn't create temporary file, e);
+  return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_EXCEPTION,
+ Couldn't create temporary file: + 
e).getEmptyParse(conf);
 } catch (TagException e) {
-  throw new ParseException(ID3 Tags could not be parsed, e);
+  return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_EXCEPTION,
+ ID3 Tags could not be parsed: + 
e).getEmptyParse(conf);
 } finally{
   tmp.delete();
 }
 return parse;
   }
 
-  private Parse getID3v1Parse(MP3File mp3) throws MalformedURLException {
+  private Parse getID3v1Parse(MP3File mp3, Metadata contentMeta)
+  throws MalformedURLException {
+
 ID3v1 tag = mp3.getID3v1Tag();
 metadataCollector.notifyProperty(TALB-Text, tag.getAlbum());
 metadataCollector.notifyProperty(TPE1-Text, tag.getArtist());
@@ -84,13 +100,17 @@
 metadataCollector.notifyProperty(TCON-Text, ( + tag.getGenre() + ));
 metadataCollector.notifyProperty(TIT2-Text, tag.getTitle());
 metadataCollector.notifyProperty(TYER-Text, tag.getYear());
-ParseData parseData = new ParseData(metadataCollector.getTitle(),
-metadataCollector.getOutlinks(),
-metadataCollector.getData(), getConf());
+ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS

svn commit: r378214 - in /lucene/nutch/trunk/src/plugin: ./ clustering-carrot2/ lib-commons-httpclient/ lib-commons-httpclient/lib/ parse-rss/lib/ protocol-httpclient/ protocol-httpclient/lib/

2006-02-16 Thread jerome
Author: jerome
Date: Thu Feb 16 02:10:23 2006
New Revision: 378214

URL: http://svn.apache.org/viewcvs?rev=378214&view=rev
Log:
NUTCH-196 : add a httpclient library (lib-commons-httpclient)

Added:
lucene/nutch/trunk/src/plugin/lib-commons-httpclient/
lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml   (with 
props)
lucene/nutch/trunk/src/plugin/lib-commons-httpclient/lib/

lucene/nutch/trunk/src/plugin/lib-commons-httpclient/lib/commons-httpclient-3.0.jar
   (with props)
lucene/nutch/trunk/src/plugin/lib-commons-httpclient/plugin.xml   (with 
props)
Removed:
lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-httpclient-3.0-beta1.jar

lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar
Modified:
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml
lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=378214r1=378213r2=378214view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Thu Feb 16 02:10:23 2006
@@ -11,6 +11,7 @@
  ant dir=index-basic target=deploy/
  ant dir=index-more target=deploy/
  ant dir=languageidentifier target=deploy/
+ ant dir=lib-commons-httpclient target=deploy/
  ant dir=lib-http target=deploy/
  ant dir=lib-jakarta-poi target=deploy/
  ant dir=lib-log4j target=deploy/
@@ -77,6 +78,7 @@
 ant dir=index-basic target=clean/
 ant dir=index-more target=clean/
 ant dir=languageidentifier target=clean/
+ant dir=lib-commons-httpclient target=clean/
 ant dir=lib-http target=clean/
 ant dir=lib-jakarta-poi target=clean/
 ant dir=lib-log4j target=clean/

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml?rev=378214r1=378213r2=378214view=diff
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml Thu Feb 16 
02:10:23 2006
@@ -5,8 +5,9 @@
   import file=../build-plugin.xml/
 
   path id=plugin.deps
-fileset dir=../lib-log4j/lib
-  include name=*.jar /
+fileset dir=${nutch.root}/build
+  include name=**/lib-log4j/*.jar /
+  include name=**/lib-commons-httpclient/*.jar /
 /fileset
   /path
 

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=378214r1=378213r2=378214view=diff
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Thu Feb 16 
02:10:23 2006
@@ -30,6 +30,7 @@
requires
   import plugin=nutch-extensionpoints/
   import plugin=lib-log4j/
+  import plugin=lib-commons-httpclient/
/requires
 
extension id=org.apache.nutch.clustering.carrot2

Added: lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml?rev=378214view=auto
==
--- lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml Thu Feb 16 
02:10:23 2006
@@ -0,0 +1,21 @@
+?xml version=1.0?
+
+project name=lib-commons-httpclient default=jar
+
+  import file=../build-plugin.xml/
+
+  !--
+   ! Override the compile and jar targets,
+   ! since there is nothing to compile here.   
  
+   ! --
+  target name=compile depends=init
+echo message=Compiling plugin: ${name}/
+  /target
+
+  target name=jar depends=compile
+copy todir=${build.dir} verbose=true
+  fileset dir=./lib includes=**/*.jar/
+/copy
+  /target
+
+/project

Propchange: lucene/nutch/trunk/src/plugin/lib-commons-httpclient/build.xml
--
svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/lib-commons-httpclient/lib/commons-httpclient-3.0.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-commons-httpclient/lib/commons-httpclient-3.0.jar?rev=378214view=auto
==
Binary file - no diff available.

Propchange: 
lucene/nutch/trunk

svn commit: r378215 - /lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml

2006-02-16 Thread jerome
Author: jerome
Date: Thu Feb 16 02:22:10 2006
New Revision: 378215

URL: http://svn.apache.org/viewcvs?rev=378215&view=rev
Log:
remove the unused httpclient library declaration

Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml?rev=378215r1=378214r2=378215view=diff
==
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Thu Feb 16 
02:22:10 2006
@@ -10,7 +10,6 @@
  export name=*/
   /library
   library name=commons-codec.jar /
-  library name=commons-httpclient-3.0.jar /
/runtime
 
requires




svn commit: r378222 - /lucene/nutch/trunk/src/plugin/build.xml

2006-02-16 Thread jerome
Author: jerome
Date: Thu Feb 16 02:51:14 2006
New Revision: 378222

URL: http://svn.apache.org/viewcvs?rev=378222&view=rev
Log:
add lib-nekohtml to deploy and clean and force the lib plugins to be built
first

Modified:
lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=378222r1=378221r2=378222view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Thu Feb 16 02:51:14 2006
@@ -6,17 +6,18 @@
   !-- Build  deploy all the plugin jars.--
   !-- == --
   target name=deploy
- ant dir=clustering-carrot2 target=deploy/
- ant dir=creativecommons target=deploy/
- ant dir=index-basic target=deploy/
- ant dir=index-more target=deploy/
- ant dir=languageidentifier target=deploy/
  ant dir=lib-commons-httpclient target=deploy/
  ant dir=lib-http target=deploy/
  ant dir=lib-jakarta-poi target=deploy/
  ant dir=lib-log4j target=deploy/
  ant dir=lib-lucene-analyzers target=deploy/
+ ant dir=lib-nekohtml target=deploy/
  ant dir=lib-parsems target=deploy/
+ ant dir=clustering-carrot2 target=deploy/
+ ant dir=creativecommons target=deploy/
+ ant dir=index-basic target=deploy/
+ ant dir=index-more target=deploy/
+ ant dir=languageidentifier target=deploy/
  ant dir=nutch-extensionpoints target=deploy/
  ant dir=ontology target=deploy/
  ant dir=protocol-file target=deploy/
@@ -83,6 +84,7 @@
 ant dir=lib-jakarta-poi target=clean/
 ant dir=lib-log4j target=clean/
 ant dir=lib-lucene-analyzers target=clean/
+ant dir=lib-nekohtml target=clean/
 ant dir=lib-parsems target=clean/
 ant dir=nutch-extensionpoints target=clean/
 ant dir=ontology target=clean/




svn commit: r378011 - in /lucene/nutch/trunk/src/plugin: ./ clustering-carrot2/ clustering-carrot2/lib/ lib-log4j/ lib-log4j/lib/ parse-pdf/ parse-pdf/lib/ parse-rss/ parse-rss/lib/

2006-02-15 Thread jerome
Author: jerome
Date: Wed Feb 15 06:24:56 2006
New Revision: 378011

URL: http://svn.apache.org/viewcvs?rev=378011&view=rev
Log:
Add a log4j library plugin (lib-log4j)

Added:
lucene/nutch/trunk/src/plugin/lib-log4j/
lucene/nutch/trunk/src/plugin/lib-log4j/build.xml   (with props)
lucene/nutch/trunk/src/plugin/lib-log4j/lib/
lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar   (with props)
lucene/nutch/trunk/src/plugin/lib-log4j/plugin.xml   (with props)
Removed:
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/log4j-1.2.11.jar
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/log4j.LICENSE
lucene/nutch/trunk/src/plugin/parse-pdf/lib/log4j-1.2.9.jar
lucene/nutch/trunk/src/plugin/parse-pdf/lib/log4j-LICENSE.txt
lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar
Modified:
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
lucene/nutch/trunk/src/plugin/parse-pdf/build.xml
lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-rss/build.xml
lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=378011r1=378010r2=378011view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Wed Feb 15 06:24:56 2006
@@ -13,6 +13,7 @@
  ant dir=languageidentifier target=deploy/
  ant dir=lib-http target=deploy/
  ant dir=lib-jakarta-poi target=deploy/
+ ant dir=lib-log4j target=deploy/
  ant dir=lib-lucene-analyzers target=deploy/
  ant dir=lib-parsems target=deploy/
  ant dir=nutch-extensionpoints target=deploy/
@@ -78,6 +79,7 @@
 ant dir=languageidentifier target=clean/
 ant dir=lib-http target=clean/
 ant dir=lib-jakarta-poi target=clean/
+ant dir=lib-log4j target=clean/
 ant dir=lib-lucene-analyzers target=clean/
 ant dir=lib-parsems target=clean/
 ant dir=nutch-extensionpoints target=clean/

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml?rev=378011r1=378010r2=378011view=diff
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml Wed Feb 15 
06:24:56 2006
@@ -4,4 +4,10 @@
 
   import file=../build-plugin.xml/
 
+  path id=plugin.deps
+fileset dir=../lib-log4j/lib
+  include name=*.jar /
+/fileset
+  /path
+
 /project

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=378011r1=378010r2=378011view=diff
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Wed Feb 15 
06:24:56 2006
@@ -29,6 +29,7 @@
 
requires
   import plugin=nutch-extensionpoints/
+  import plugin=lib-log4j/
/requires
 
extension id=org.apache.nutch.clustering.carrot2

Added: lucene/nutch/trunk/src/plugin/lib-log4j/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-log4j/build.xml?rev=378011view=auto
==
--- lucene/nutch/trunk/src/plugin/lib-log4j/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-log4j/build.xml Wed Feb 15 06:24:56 2006
@@ -0,0 +1,17 @@
+?xml version=1.0?
+
+project name=lib-log4j default=jar
+
+  import file=../build-plugin.xml/
+
+  !--
+   ! Override the compile and jar targets,
+   ! since there is nothing to compile here.
+   ! --
+  target name=compile depends=init
+echo message=Compiling plugin: ${name}/
+  /target
+
+  target name=jar depends=compile/
+
+/project

Propchange: lucene/nutch/trunk/src/plugin/lib-log4j/build.xml
--
svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar?rev=378011view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/lib-log4j/lib/log4j-1.2.11.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/lib-log4j/plugin.xml
URL: 
http://svn.apache.org

svn commit: r377494 - in /lucene/nutch/trunk/src/plugin: parse-msexcel/ parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ parse-mspowerpoint/ parse-mspowerpoint/src/java/org/apache/nutch/parse/ms

2006-02-13 Thread jerome
Author: jerome
Date: Mon Feb 13 13:28:13 2006
New Revision: 377494

URL: http://svn.apache.org/viewcvs?rev=377494&view=rev
Log:
Make use of lib-parsems in word, powerpoint and excel parsers

Removed:

lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java

lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java

lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java
Modified:
lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml

lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java

lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java

lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml

lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java

lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java

lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
lucene/nutch/trunk/src/plugin/parse-msword/build.xml
lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html

Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml?rev=377494r1=377493r2=377494view=diff
==
--- lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml Mon Feb 13 13:28:13 
2006
@@ -2,19 +2,23 @@
 
 project name=parse-msexcel default=jar
 
-   import file=../build-plugin.xml /
+  import file=../build-plugin.xml /
 
   path id=plugin.deps
 fileset dir=../lib-jakarta-poi/lib
   include name=*.jar /
 /fileset
+fileset dir=../../../build/lib-parsems
+  include name=*.jar /
+/fileset
   /path
 
-   !-- for junit test --
-   mkdir dir=${build.test}/data /
-   copy todir=${build.test}/data
-   fileset dir=sample
-   include name=*.xls /
-   /fileset
-   /copy
+  !-- for junit test --
+  mkdir dir=${build.test}/data /
+  copy todir=${build.test}/data
+fileset dir=sample
+  include name=*.xls /
+/fileset
+  /copy
+
 /project

Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml?rev=377494r1=377493r2=377494view=diff
==
--- lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml Mon Feb 13 13:28:13 
2006
@@ -14,6 +14,7 @@
requires
  import plugin=nutch-extensionpoints/
  import plugin=lib-jakarta-poi/
+ import plugin=lib-parsems/
/requires
 
extension id=org.apache.nutch.parse.msexcel

Modified: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?rev=377494r1=377493r2=377494view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
 Mon Feb 13 13:28:13 2006
@@ -16,17 +16,17 @@
 package org.apache.nutch.parse.msexcel;
 
 // JDK imports
-import java.io.IOException;
 import java.io.InputStream;
-import java.util.Date;
-import java.util.Properties;
 
 // Jakarta POI imports
 import org.apache.poi.hssf.usermodel.HSSFCell;
 import org.apache.poi.hssf.usermodel.HSSFRow;
 import org.apache.poi.hssf.usermodel.HSSFSheet;
 import

svn commit: r377501 - in /lucene/nutch/trunk: ./ src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/ src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/

2006-02-13 Thread jerome
Author: jerome
Date: Mon Feb 13 13:43:15 2006
New Revision: 377501

URL: http://svn.apache.org/viewcvs?rev=377501&view=rev
Log:
Javadoc updates for ms parsers

Added:

lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html
   (with props)
Modified:
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/default.properties

lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java

lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=377501r1=377500r2=377501view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Mon Feb 13 13:43:15 2006
@@ -249,6 +249,7 @@
 
packageset dir=${src.dir}/
packageset dir=${plugins.dir}/lib-http/src/java/
+   packageset dir=${plugins.dir}/lib-parsems/src/java/
packageset dir=${plugins.dir}/ontology/src/java/
packageset dir=${plugins.dir}/protocol-file/src/java/
packageset dir=${plugins.dir}/protocol-ftp/src/java/

Modified: lucene/nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=377501r1=377500r2=377501view=diff
==
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Mon Feb 13 13:43:15 2006
@@ -68,6 +68,7 @@
 plugin.msword=org.apache.nutch.parse.msword*
 # Unfortunately, ontology on core and plugin uses the same package:
 # plugin.ontology=org.apache.nutch.ontology*
+plugin.parsems=org.apache.nutch.parse.ms*
 plugin.pdf=org.apache.nutch.parse.pdf*
 plugin.rss=org.apache.nutch.parse.rss*
 plugin.rtf=org.apache.nutch.parse.rtf*
@@ -95,6 +96,7 @@
${plugin.msexcel}:\
${plugin.mspowerpoint}:\
${plugin.msword}:\
+   ${plugin.parsems}:\
${plugin.pdf}:\
${plugin.rss}:\
${plugin.rtf}:\

Modified: 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?rev=377501r1=377500r2=377501view=diff
==
--- 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
 Mon Feb 13 13:43:15 2006
@@ -56,7 +56,7 @@
 
   /**
* Parses a Content with a specific [EMAIL PROTECTED] MSExtractor Microsoft 
document
-   * extractor.
+   * extractor}.
*/
   protected Parse getParse(MSExtractor extractor, Content content) {
 

Added: 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html?rev=377501view=auto
==
--- 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html
 (added)
+++ 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html
 Mon Feb 13 13:43:15 2006
@@ -0,0 +1,5 @@
+html
+body
+pCommon API for Microsoft copy; documents parsing./p
+/body
+/html

Propchange: 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html
--
svn:eol-style = native

Modified: 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java?rev=377501r1=377500r2=377501view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java
 Mon Feb 13 13:43:15 2006
@@ -23,7 +23,6 @@
  * 
  * @author Stephan Strittmatter - http://www.sybit.de
  * @version 1.0
- * @create 19.01.2005
  */
 public class FilteredStringWriter extends StringWriter {
 
@@ -67,4 +66,4 @@
   super.write(ch);
 }
   }
-}
\ No newline at end of file
+}




svn commit: r376966 - /lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java

2006-02-11 Thread jerome
Author: jerome
Date: Sat Feb 11 02:56:14 2006
New Revision: 376966

URL: http://svn.apache.org/viewcvs?rev=376966&view=rev
Log:
Fix parse-msexcel unit tests

Modified:

lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java

Modified: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java?rev=376966r1=376965r2=376966view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
 Sat Feb 11 02:56:14 2006
@@ -4,18 +4,25 @@
  */
 package org.apache.nutch.parse.msexcel;
 
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ProtocolException;
-
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConfiguration;
 
+// JUnit imports
 import junit.framework.TestCase;
 
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.UTF8;
+
+
 /** 
  * Based on Unit tests for MSWordParser by John Xing
  *
@@ -31,31 +38,32 @@
   
   private String[] sampleFiles = {test.xls};
 
-  private String expectedText = BitStream test.xls 321654.0 Apache incubator 
1234.0 Doug Cutting 89078.0 CS 599 Search Engines Spring 2005.0 SBC 1234.0 
764893.0 Java NUTCH!! ;
+  private String expectedText = BitStream test.xls 321654.0 Apache  +
+incubator 1234.0 Doug Cutting 89078.0  +
+CS 599 Search Engines Spring 2005.0 SBC  +
+1234.0 764893.0 Java NUTCH!! ;
 
   public TestMSExcelParser(String name) { 
-super(name); 
+super(name);
   }
 
-  protected void setUp() {}
-
-  protected void tearDown() {}
-
   public void testIt() throws ProtocolException, ParseException {
+
 String urlString;
 Protocol protocol;
 Content content;
-Parser parser;
 Parse parse;
 
+Configuration conf = NutchConfiguration.create();
+ParseUtil parser = new ParseUtil(conf);
+ProtocolFactory factory = new ProtocolFactory(conf);
 for (int i = 0; i  sampleFiles.length; i++) {
   urlString = file: + sampleDir + fileSeparator + sampleFiles[i];
 
-  protocol = ProtocolFactory.getProtocol(urlString);
-  content = protocol.getContent(urlString);
-
-  parser = ParserFactory.getParser(content.getContentType(), urlString);
-  parse = parser.getParse(content);
+  protocol = factory.getProtocol(urlString);
+  content = protocol.getProtocolOutput(new UTF8(urlString),
+   new CrawlDatum()).getContent();
+  parse = parser.parseByParserId(parse-msexcel, content);
 
   assertTrue(parse.getText().equals(expectedText));
 }




svn commit: r376638 - in /lucene/nutch/trunk/src/plugin/parse-msword: lib/poi-2.1-20040508.jar lib/poi-scratchpad-2.1-20040508.jar plugin.xml

2006-02-10 Thread jerome
Author: jerome
Date: Fri Feb 10 03:24:37 2006
New Revision: 376638

URL: http://svn.apache.org/viewcvs?rev=376638&view=rev
Log:
Remove POI libs that are no longer used

Removed:
lucene/nutch/trunk/src/plugin/parse-msword/lib/poi-2.1-20040508.jar

lucene/nutch/trunk/src/plugin/parse-msword/lib/poi-scratchpad-2.1-20040508.jar
Modified:
lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml

Modified: lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml?rev=376638r1=376637r2=376638view=diff
==
--- lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml Fri Feb 10 03:24:37 
2006
@@ -9,8 +9,6 @@
   library name=parse-msword.jar
  export name=*/
   /library
-  library name=poi-2.1-20040508.jar/
-  library name=poi-scratchpad-2.1-20040508.jar/
/runtime
 
requires




svn commit: r376315 - /lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java

2006-02-09 Thread jerome
Author: jerome
Date: Thu Feb  9 07:14:17 2006
New Revision: 376315

URL: http://svn.apache.org/viewcvs?rev=376315view=rev
Log:
Add a default constructor to avoid failure when JobConf tries to instantiate 
ParseSegment on parse command.
(Just a workaround for HADOOP-29 issue).

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=376315r1=376314r2=376315view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Thu 
Feb  9 07:14:17 2006
@@ -35,7 +35,11 @@
 
   public static final Logger LOG =
 LogFormatter.getLogger(Parser.class.getName());
-
+  
+  public ParseSegment() {
+this(null);
+  }
+  
   public ParseSegment(Configuration conf) {
 super(conf);
   }




svn commit: r376478 - /lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

2006-02-09 Thread jerome
Author: jerome
Date: Thu Feb  9 15:12:59 2006
New Revision: 376478

URL: http://svn.apache.org/viewcvs?rev=376478view=rev
Log:
Ensure ParseData content metadata contains segment name and score

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=376478r1=376477r2=376478view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Feb  
9 15:12:59 2006
@@ -235,6 +235,9 @@
 byte[] signature = 
SignatureFactory.getSignature(getConf()).calculate(content, parse);
 metadata.set(SIGNATURE_KEY, StringUtil.toHexString(signature));
 datum.setSignature(signature);
+// Ensure segment name and score are in parseData metadata
+parse.getData().getContentMeta().set(SEGMENT_NAME_KEY, segmentName);   
 
+parse.getData().getContentMeta().set(SCORE_KEY, 
Float.toString(datum.getScore()));
   }
 
   try {




svn commit: r376522 - /lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

2006-02-09 Thread jerome
Author: jerome
Date: Thu Feb  9 17:08:15 2006
New Revision: 376522

URL: http://svn.apache.org/viewcvs?rev=376522view=rev
Log:
Ensure signature added in content metadata

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=376522r1=376521r2=376522view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Feb  
9 17:08:15 2006
@@ -237,7 +237,8 @@
 datum.setSignature(signature);
 // Ensure segment name and score are in parseData metadata
 parse.getData().getContentMeta().set(SEGMENT_NAME_KEY, segmentName);   
 
-parse.getData().getContentMeta().set(SCORE_KEY, 
Float.toString(datum.getScore()));
+parse.getData().getContentMeta().set(SCORE_KEY, 
Float.toString(datum.getScore()));
+parse.getData().getContentMeta().set(SIGNATURE_KEY, 
StringUtil.toHexString(signature));
   }
 
   try {




svn commit: r375965 - in /lucene/nutch/trunk: build.xml src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html

2006-02-08 Thread jerome
Author: jerome
Date: Wed Feb  8 05:58:08 2006
New Revision: 375965

URL: http://svn.apache.org/viewcvs?rev=375965view=rev
Log:
Fix some javadoc issues with lib-http plugin

Added:

lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html
   (with props)
Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=375965r1=375964r2=375965view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Wed Feb  8 05:58:08 2006
@@ -223,6 +223,7 @@
   bottom=Copyright amp;copy; ${year} The Apache Software Foundation
   
packageset dir=${src.dir}/
+   packageset dir=${plugins.dir}/lib-http/src/java/
packageset dir=${plugins.dir}/protocol-file/src/java/
packageset dir=${plugins.dir}/protocol-ftp/src/java/
packageset dir=${plugins.dir}/protocol-http/src/java/

Added: 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html?rev=375965view=auto
==
--- 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html
 (added)
+++ 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html
 Wed Feb  8 05:58:08 2006
@@ -0,0 +1,6 @@
+html
+body
+pCommon API used by HTTP plugins ({@link 
org.apache.nutch.protocol.http http},
{@link org.apache.nutch.protocol.httpclient httpclient})/p
+/body
+/html

Propchange: 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html
--
svn:eol-style = native




svn commit: r375984 - in /lucene/nutch/trunk/src: java/org/apache/nutch/parse/ java/org/apache/nutch/util/ plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/

2006-02-08 Thread jerome
Author: jerome
Date: Wed Feb  8 07:42:44 2006
New Revision: 375984

URL: http://svn.apache.org/viewcvs?rev=375984view=rev
Log:
Fix some javadoc errors and warnings

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=375984r1=375983r2=375984view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java 
Wed Feb  8 07:42:44 2006
@@ -64,9 +64,9 @@
   
   /**
* Reads the codeparse-plugins.xml/code file and returns the
-   * {@link ParsePluginPreferenceList} defined by it.
+   * {@link ParsePluginList} defined by it.
*
-   * @return A {@link ParsePluginPreferenceList} specified by the
+   * @return A {@link ParsePluginList} specified by the
* codeparse-plugins.xml/code file.
* @throws Exception
* If any parsing error occurs.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=375984r1=375983r2=375984view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Wed Feb  
8 07:42:44 2006
@@ -51,7 +51,7 @@
   }
   
   /**
-   * Performs a parse by iterating through a List of preferred [EMAIL 
PROTECTED]
+   * Performs a parse by iterating through a List of preferred {@link 
Parser}s
* until a successful parse is performed and a {@link Parse} 
object is
* returned. If the parse is unsuccessful, a message is logged to the
* codeWARNING/code level, and an empty parse is returned.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=375984r1=375983r2=375984view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Wed 
Feb  8 07:42:44 2006
@@ -106,13 +106,13 @@
*
* The function consults the internal {@link ParsePluginList} for 
the
* ParserFactory to determine the list of pluginIds, then gets the
-   * appropriate extension points to instantiate as {Parser}s.
+   * appropriate extension points to instantiate as {@link Parser}s.
*
* @param contentType The contentType to return the codeArray/code
-   *of {Parser}s for.
+   *of {@link Parser}s for.
* @param url The url for the content that may allow us to get the type from
*the file suffix.
-   * @return An codeArray/code of [EMAIL PROTECTED] for the given 
contentType.
+   * @return An codeArray/code of {@link Parser}s for the given 
contentType.
* If there were plugins mapped to a contentType via the
* codeparse-plugins.xml/code file, but never enabled via
* the codeplugin.includes/code Nutch conf, then those plugins

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java?rev=375984r1=375983r2=375984view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java Wed Feb  
8 07:42:44 2006
@@ -57,7 +57,6 @@
* Convenience call for {@link #toHexString(byte[], String, 
int)}, where
* codesep = null; lineLen = Integer.MAX_VALUE/code.
* @param buf
-   * @return
*/
   public static String toHexString(byte[] buf) {
 return toHexString(buf, null, Integer.MAX_VALUE);

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src

svn commit: r376012 - in /lucene/nutch/trunk: build.xml default.properties

2006-02-08 Thread jerome
Author: jerome
Date: Wed Feb  8 10:03:01 2006
New Revision: 376012

URL: http://svn.apache.org/viewcvs?rev=376012view=rev
Log:
Add/Move some plugins javadoc to the Plugins group

Modified:
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/default.properties

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376012r1=376011r2=376012view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Wed Feb  8 10:03:01 2006
@@ -227,20 +227,28 @@
 
packageset dir=${src.dir}/
packageset dir=${plugins.dir}/lib-http/src/java/
+   packageset dir=${plugins.dir}/ontology/src/java/
packageset dir=${plugins.dir}/protocol-file/src/java/
packageset dir=${plugins.dir}/protocol-ftp/src/java/
packageset dir=${plugins.dir}/protocol-http/src/java/
packageset dir=${plugins.dir}/protocol-httpclient/src/java/
+   packageset dir=${plugins.dir}/parse-ext/src/java/
packageset dir=${plugins.dir}/parse-html/src/java/
packageset dir=${plugins.dir}/parse-js/src/java/
packageset dir=${plugins.dir}/parse-text/src/java/
packageset dir=${plugins.dir}/parse-pdf/src/java/
 !--   packageset dir=${plugins.dir}/parse-rtf/src/java/ plugin excluded 
from build due to licensing issues--
 !--   packageset dir=${plugins.dir}/parse-mp3/src/java/ plugin excluded 
from build due to licensing issues--
+   packageset dir=${plugins.dir}/parse-mspowerpoint/src/java/
packageset dir=${plugins.dir}/parse-msword/src/java/
+   packageset dir=${plugins.dir}/parse-rss/src/java/
+   packageset dir=${plugins.dir}/parse-swf/src/java/
+   packageset dir=${plugins.dir}/parse-zip/src/java/
packageset dir=${plugins.dir}/index-basic/src/java/
packageset dir=${plugins.dir}/index-more/src/java/
packageset dir=${plugins.dir}/query-more/src/java/
+   packageset dir=${plugins.dir}/query-site/src/java/
+   packageset dir=${plugins.dir}/query-url/src/java/
packageset dir=${plugins.dir}/urlfilter-regex/src/java/
packageset dir=${plugins.dir}/urlfilter-prefix/src/java/
packageset dir=${plugins.dir}/creativecommons/src/java/

Modified: lucene/nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=376012r1=376011r2=376012view=diff
==
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Wed Feb  8 10:03:01 2006
@@ -33,8 +33,6 @@
 src.webapps = ./src/webapps
 
 # Proxy Host and Port to use for building JavaDoc
-#javadoc.proxy.host=-J-DproxyHost=
-#javadoc.proxy.port=-J-DproxyPort=
 javadoc.proxy.host=-J-DproxyHost=
 javadoc.proxy.port=-J-DproxyPort=
 javadoc.link.java=http://java.sun.com/j2se/1.4.2/docs/api/
@@ -49,21 +47,57 @@
 javac.deprecation=off
 javac.version= 1.4
 
-plugin.http=org.apache.nutch.protocol.http*
-plugin.httpclient=org.apache.nutch.protocol.httpclient*
-plugin.ftp=org.apache.nutch.protocol.ftp*
+# The list of packages assigned to plugins group in javadoc
+# (please keep this list ordered)
+plugin.basic=org.apache.nutch.indexer.basic*
+plugin.carrot2=org.apache.nutch.clustering.carrot2*
+plugin.creative=org.creativecommons.nutch*
+plugin.ext=org.apache.nutch.parse.ext*
 plugin.file=org.apache.nutch.protocol.file*
+plugin.ftp=org.apache.nutch.protocol.ftp*
 plugin.html=org.apache.nutch.parse.html*
+plugin.http=org.apache.nutch.protocol.http*
+plugin.httpclient=org.apache.nutch.protocol.httpclient*
 plugin.js=org.apache.nutch.parse.js*
+plugin.language=org.apache.nutch.analysis.lang*
+plugin.libhttp=org.apache.nutch.protocol.http.api*
+plugin.more=org.apache.nutch.indexer.more*:org.apache.nutch.searcher.more*
 plugin.mp3=org.apache.nutch.parse.mp3*
+plugin.mspowerpoint=org.apache.nutch.parse.mspowerpoint*
 plugin.msword=org.apache.nutch.parse.msword*
-plugin.rtf=org.apache.nutch.parse.rtf*
+# Unfortunately, ontology on core and plugin uses the same package:
+# plugin.ontology=org.apache.nutch.ontology*
 plugin.pdf=org.apache.nutch.parse.pdf*
+plugin.rss=org.apache.nutch.parse.rss*
+plugin.rtf=org.apache.nutch.parse.rtf*
+plugin.site=org.apache.nutch.searcher.site*
+plugin.swf=org.apache.nutch.parse.swf*
 plugin.text=org.apache.nutch.parse.text*
-plugin.basic=org.apache.nutch.indexer.basic*
-plugin.more=org.apache.nutch.indexer.more*
-plugin.language=org.apache.nutch.analysis.lang*
-plugin.creative=org.creativecommons.nutch*
-plugins.packages=${plugin.http}:${plugin.httpclient}:${plugin.ftp}:${plugin.file}:${plugin.html}:${plugin.js}:${plugin.mp3}:\
-   
${plugin.msword}:${plugin.rtf}:${plugin.pdf}:${plugin.text}:${plugin.basic}:${plugin.more}:\
-   ${plugin.language}:${plugin.creative}
+plugin.url=org.apache.nutch.searcher.url*
+plugin.zip

svn commit: r368172 - in /lucene/nutch/trunk: docs/ca/ docs/de/ docs/en/ docs/es/ docs/fi/ docs/fr/ docs/hu/ docs/jp/ docs/ms/ docs/nl/ docs/pl/ docs/pt/ docs/sv/ docs/th/ docs/zh/ src/java/org/apache

2006-01-11 Thread jerome
Author: jerome
Date: Wed Jan 11 15:50:13 2006
New Revision: 368172

URL: http://svn.apache.org/viewcvs?rev=368172view=rev
Log:
Add a style for summary highlight and ellipsis

Modified:
lucene/nutch/trunk/docs/ca/about.html
lucene/nutch/trunk/docs/ca/help.html
lucene/nutch/trunk/docs/ca/search.html
lucene/nutch/trunk/docs/de/about.html
lucene/nutch/trunk/docs/de/help.html
lucene/nutch/trunk/docs/de/search.html
lucene/nutch/trunk/docs/en/about.html
lucene/nutch/trunk/docs/en/help.html
lucene/nutch/trunk/docs/en/search.html
lucene/nutch/trunk/docs/es/about.html
lucene/nutch/trunk/docs/es/help.html
lucene/nutch/trunk/docs/es/search.html
lucene/nutch/trunk/docs/fi/about.html
lucene/nutch/trunk/docs/fi/help.html
lucene/nutch/trunk/docs/fi/search.html
lucene/nutch/trunk/docs/fr/about.html
lucene/nutch/trunk/docs/fr/search.html
lucene/nutch/trunk/docs/hu/about.html
lucene/nutch/trunk/docs/hu/help.html
lucene/nutch/trunk/docs/hu/search.html
lucene/nutch/trunk/docs/jp/about.html
lucene/nutch/trunk/docs/jp/help.html
lucene/nutch/trunk/docs/jp/search.html
lucene/nutch/trunk/docs/ms/about.html
lucene/nutch/trunk/docs/ms/help.html
lucene/nutch/trunk/docs/ms/search.html
lucene/nutch/trunk/docs/nl/about.html
lucene/nutch/trunk/docs/nl/help.html
lucene/nutch/trunk/docs/nl/search.html
lucene/nutch/trunk/docs/pl/about.html
lucene/nutch/trunk/docs/pl/help.html
lucene/nutch/trunk/docs/pl/search.html
lucene/nutch/trunk/docs/pt/about.html
lucene/nutch/trunk/docs/pt/help.html
lucene/nutch/trunk/docs/pt/search.html
lucene/nutch/trunk/docs/sv/about.html
lucene/nutch/trunk/docs/sv/help.html
lucene/nutch/trunk/docs/sv/search.html
lucene/nutch/trunk/docs/th/about.html
lucene/nutch/trunk/docs/th/help.html
lucene/nutch/trunk/docs/th/search.html
lucene/nutch/trunk/docs/zh/about.html
lucene/nutch/trunk/docs/zh/help.html
lucene/nutch/trunk/docs/zh/search.html
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java
lucene/nutch/trunk/src/web/include/style.html

Modified: lucene/nutch/trunk/docs/ca/about.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/about.html?rev=368172r1=368171r2=368172view=diff
==
--- lucene/nutch/trunk/docs/ca/about.html (original)
+++ lucene/nutch/trunk/docs/ca/about.html Wed Jan 11 15:50:13 2006
@@ -16,6 +16,8 @@
 h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: 
#00;}
 h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: 
#00;}
 .url {color: #996600;}
+.highlight {font-weight: bold;}
+.ellipsis {font-weight: bold;}
 /style
 link type=image/x-icon href=../img/favicon.ico rel=icon
 link type=image/x-icon href=../img/favicon.ico rel=shortcut icon

Modified: lucene/nutch/trunk/docs/ca/help.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/help.html?rev=368172r1=368171r2=368172view=diff
==
--- lucene/nutch/trunk/docs/ca/help.html (original)
+++ lucene/nutch/trunk/docs/ca/help.html Wed Jan 11 15:50:13 2006
@@ -16,6 +16,8 @@
 h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: 
#00;}
 h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: 
#00;}
 .url {color: #996600;}
+.highlight {font-weight: bold;}
+.ellipsis {font-weight: bold;}
 /style
 link type=image/x-icon href=../img/favicon.ico rel=icon
 link type=image/x-icon href=../img/favicon.ico rel=shortcut icon

Modified: lucene/nutch/trunk/docs/ca/search.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/search.html?rev=368172r1=368171r2=368172view=diff
==
--- lucene/nutch/trunk/docs/ca/search.html (original)
+++ lucene/nutch/trunk/docs/ca/search.html Wed Jan 11 15:50:13 2006
@@ -16,6 +16,8 @@
 h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: 
#00;}
 h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: 
#00;}
 .url {color: #996600;}
+.highlight {font-weight: bold;}
+.ellipsis {font-weight: bold;}
 /style
 link type=image/x-icon href=../img/favicon.ico rel=icon
 link type=image/x-icon href=../img/favicon.ico rel=shortcut icon

Modified: lucene/nutch/trunk/docs/de/about.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/de/about.html?rev=368172r1=368171r2=368172view=diff
==
--- lucene/nutch/trunk/docs/de/about.html (original)
+++ lucene/nutch/trunk/docs/de/about.html Wed Jan 11 15:50:13 2006
@@ -16,6 +16,8 @@
 h3 {font-family: Arial, Helvetica, sans-serif; font-size: 16px; color: 
#00;}
 h4 {font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: 
#00;}
 .url

svn commit: r367405 - /lucene/nutch/trunk/src/plugin/build.xml

2006-01-09 Thread jerome
Author: jerome
Date: Mon Jan  9 13:45:25 2006
New Revision: 367405

URL: http://svn.apache.org/viewcvs?rev=367405view=rev
Log:
Remove deployment of analysis plugins (under dev)
Remove protocol-http unit tests (moved to lib-http)

Modified:
lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=367405r1=367404r2=367405view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Mon Jan  9 13:45:25 2006
@@ -6,8 +6,6 @@
   !-- Build  deploy all the plugin jars.--
   !-- == --
   target name=deploy
- ant dir=analysis-de target=deploy/
- ant dir=analysis-fr target=deploy/
  ant dir=clustering-carrot2 target=deploy/
  ant dir=creativecommons target=deploy/
  ant dir=index-basic target=deploy/
@@ -47,8 +45,8 @@
   target name=test
  ant dir=creativecommons target=test/
  ant dir=languageidentifier target=test/
+ ant dir=lib-http target=test/
  ant dir=ontology target=test/
- ant dir=protocol-http target=test/
  ant dir=parse-ext target=test/
  ant dir=parse-html target=test/
  !-- ant dir=parse-mp3 target=test/ --
@@ -71,6 +69,7 @@
 ant dir=index-basic target=clean/
 ant dir=index-more target=clean/
 ant dir=languageidentifier target=clean/
+ant dir=lib-http target=clean/
 ant dir=lib-jakarta-poi target=clean/
 ant dir=lib-lucene-analyzers target=clean/
 ant dir=nutch-extensionpoints target=clean/




svn commit: r357334 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/protocol/Content.java src/java/org/apache/nutch/protocol/ContentProperties.java

2005-12-17 Thread jerome
Author: jerome
Date: Sat Dec 17 02:06:31 2005
New Revision: 357334

URL: http://svn.apache.org/viewcvs?rev=357334view=rev
Log:
NUTCH-3, ContentProperties can handle multivalued properties (S. Groschupf)

Modified:
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=357334r1=357333r2=357334view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sat Dec 17 02:06:31 2005
@@ -105,7 +105,7 @@
 
 property
   namehttp.redirect.max/name
-  value3/value
+  value10/value
   descriptionThe maximum number of redirects the fetcher will follow when
 trying to fetch a page./description
 /property
@@ -727,7 +727,8 @@
 
 property
   nameparser.character.encoding.default/name
-  valuewindows-1252/value
+  !--valuewindows-1252/value--
+  valueutf-8/value
   descriptionThe character encoding to fall back to when no other information
   is available/description
 /property

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=357334r1=357333r2=357334view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec 
17 02:06:31 2005
@@ -77,11 +77,8 @@
 
 contentType = UTF8.readString(in);// read contentType
 
-int propertyCount = in.readInt(); // read metadata
 metadata = new ContentProperties();
-for (int i = 0; i  propertyCount; i++) {
-  metadata.put(UTF8.readString(in), UTF8.readString(in));
-}
+metadata.readFields(in);// read meta data
   }
 
   protected final void writeCompressed(DataOutput out) throws IOException {
@@ -95,13 +92,7 @@
 
 UTF8.writeString(out, contentType);   // write contentType
 
-out.writeInt(metadata.size());// write metadata
-Iterator i = metadata.entrySet().iterator();
-while (i.hasNext()) {
-  Map.Entry e = (Map.Entry)i.next();
-  UTF8.writeString(out, (String)e.getKey());
-  UTF8.writeString(out, (String)e.getValue());
-}
+metadata.write(out);   // write metadata
   }
 
   public static Content read(DataInput in) throws IOException {

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java?rev=357334r1=357333r2=357334view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java 
Sat Dec 17 02:06:31 2005
@@ -16,15 +16,22 @@
 
 package org.apache.nutch.protocol;
 
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.Iterator;
 import java.util.Properties;
 import java.util.TreeMap;
 
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.io.Writable;
+
 /**
- * case insensitive properties
+ * writable case insensitive properties
  */
-public class ContentProperties extends TreeMap {
+public class ContentProperties extends TreeMap implements Writable {
 
 /**
  * construct the TreeMap with a case insensitive comparator
@@ -51,6 +58,36 @@
 return (String) get(key);
 }
 
+/*
+ * (non-Javadoc)
+ * 
+ * @see java.util.Map#get(java.lang.Object)
+ */
+public Object get(Object arg0) {
+Object object = super.get(arg0);
+if (object != null  object instanceof ArrayList) {
+ArrayList list = (ArrayList) object;
+return list.get(list.size() - 1);
+}
+return object;
+}
+
+/**
+ * @param key
+ * @return the properties as a string array if there is no such property we
+ * retunr a array with 0 entries
+ */
+public String[] getProperties(String key) {
+Object object = super.get(key);
+if (object != null  !(object instanceof ArrayList)) {
+return new String[] { (String) object };
+} else if (object != null  object instanceof ArrayList) {
+ArrayList list = (ArrayList) object;
+return (String[]) list.toArray(new String[list.size()]);
+}
+return new String[0

svn commit: r355809 - in /lucene/nutch/trunk/src: java/org/apache/nutch/protocol/ java/org/apache/nutch/util/mime/ plugin/protocol-file/src/java/org/apache/nutch/protocol/file/ plugin/protocol-ftp/src

2005-12-10 Thread jerome
Author: jerome
Date: Sat Dec 10 15:47:18 2005
New Revision: 355809

URL: http://svn.apache.org/viewcvs?rev=355809view=rev
Log:
Content-Type resolution enhancements:
* Resolution moved from protocol plugins to Content constructor
* Best content-type guessing policy
* Some unit tests added

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java

lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java

lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java

lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=355809r1=355808r2=355809view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec 
10 15:47:18 2005
@@ -22,6 +22,10 @@
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
 import org.apache.nutch.util.*;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypes;
+import org.apache.nutch.util.mime.MimeTypeException;
+
 
 public final class Content extends VersionedWritable {
 
@@ -29,6 +33,14 @@
 
   private final static byte VERSION = 1;
 
+  /** A flag that tells if magic resolution must be performed */
+  private final static boolean MAGIC =
+NutchConf.get().getBoolean(mime.type.magic, true);
+
+  /** Get the MimeTypes resolver instance. */
+  private final static MimeTypes MIME = 
+MimeTypes.get(NutchConf.get().get(mime.types.file));
+
   private String url;
   private String base;
   private byte[] content;
@@ -38,18 +50,17 @@
   public Content() {}
 
   public Content(String url, String base, byte[] content, String contentType,
- Properties metadata){
+ Properties metadata) {
 
 if (url == null) throw new IllegalArgumentException(null url);
 if (base == null) throw new IllegalArgumentException(null base);
 if (content == null) throw new IllegalArgumentException(null content);
-if (contentType == null) throw new IllegalArgumentException(null type);
 if (metadata == null) throw new IllegalArgumentException(null metadata);
 
 this.url = url;
 this.base = base;
 this.content = content;
-this.contentType = contentType;
+this.contentType = getContentType(contentType, url, content);
 this.metadata = metadata;
   }
 
@@ -185,4 +196,33 @@
   nfs.close();
 }
   }
+
+  private String getContentType(String typeName, String url, byte[] data) {
+
+MimeType type = null;
+try {
+typeName = MimeType.clean(typeName);
+type = typeName == null ? null : MIME.forName(typeName);
+} catch (MimeTypeException mte) {
+// Seems to be a malformed mime type name...
+}
+
+if (typeName == null || type == null || !type.matches(url)) {
+  // If no mime-type header, or cannot find a corresponding registered
+  // mime-type, or the one found doesn't match the url pattern
+  // it shouldbe, then guess a mime-type from the url pattern
+  type = MIME.getMimeType(url);
+  typeName = type == null ? typeName : type.getName();
+}
+if (typeName == null || type == null ||
+(MAGIC  type.hasMagic()  !type.matches(data))) {
+  // If no mime-type already found, or the one found doesn't match
+  // the magic bytes it should be, then, guess a mime-type from the
+  // document content (magic bytes)
+  type = MIME.getMimeType(data);
+  typeName = type == null ? typeName : type.getName();
+}
+return typeName;
+  }
+
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java?rev=355809r1=355808r2=355809view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java Sat 
Dec 10 15:47:18 2005
@@ -227,11 +227,21 @@
 return minLength;
 }
 
-boolean hasMagic() {
+public boolean hasMagic() {
 return (magics.size()  0);
 }
 
-boolean matches(byte[] data) {
+public boolean

svn commit: r355828 - in /lucene/nutch/trunk/src: java/org/apache/nutch/fetcher/ java/org/apache/nutch/parse/ java/org/apache/nutch/protocol/ java/org/apache/nutch/servlet/ java/org/apache/nutch/tools

2005-12-10 Thread jerome
Author: jerome
Date: Sat Dec 10 16:36:57 2005
New Revision: 355828

URL: http://svn.apache.org/viewcvs?rev=355828view=rev
Log:
NUTCH-135 : Content metadata are now case insensitive (thanks to S. Groschupf)

Added:

lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java   
(with props)
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java

lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java

lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java

lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java

lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java

lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java

lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java

lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java

lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java

lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java

lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java

lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java

lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java

lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java
lucene/nutch/trunk/src/web/jsp/cached.jsp

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=355828r1=355827r2=355828view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Dec 
10 16:36:57 2005
@@ -234,7 +234,7 @@
   MD5Hash hash = null;
   String url = fle.getPage().getURL().toString();
   if (content == null) {
-content = new Content(url, url, new byte[0], , new Properties());
+content = new Content(url, url, new byte[0], , new 
ContentProperties());
 hash = MD5Hash.digest(url);
   } else {
 hash = MD5Hash.digest(content.getContent());
@@ -263,7 +263,7 @@
 + status.toString());
 outputPage(new FetcherOutput(fle, hash, protocolStatus),
 content, new ParseText(),
-new ParseData(status, , new Outlink[0], new Properties()));
+new ParseData(status, , new Outlink[0], new 
ContentProperties()));
   }
   return status;
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=355828r1=355827r2=355828view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Sat Dec 
10 16:36:57 2005
@@ -21,6 +21,7 @@
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.tools.UpdateDatabaseTool;
 
 
@@ -34,12 +35,12 @@
 
   private String title;
   private Outlink[] outlinks;
-  private Properties metadata;
+  private ContentProperties

svn commit: r354399 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java

2005-12-06 Thread jerome
Author: jerome
Date: Tue Dec  6 02:51:06 2005
New Revision: 354399

URL: http://svn.apache.org/viewcvs?rev=354399&view=rev
Log:
Merge from trunk 354397:354398 - Improvements in plugin circular dependencies 
detection

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=354399r1=354398r2=354399view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java
 Tue Dec  6 02:51:06 2005
@@ -116,13 +116,20 @@
 }
 
 private void getPluginCheckedDependencies(PluginDescriptor plugin,
- Map plugins, Map dependencies)
+ Map plugins,
+ Map dependencies,
+ Map branch)
   throws MissingDependencyException,
  CircularDependencyException {
   
   if (dependencies == null) { dependencies = new HashMap(); }
+  if (branch == null) { branch = new HashMap(); }
+  branch.put(plugin.getPluginId(), plugin);
   
+  // Get the plugin dependencies
   String[] ids = plugin.getDependencies();
+
+  // Otherwise, checks each dependency
   for (int i=0; i<ids.length; i++) {
 String id = ids[i];
 PluginDescriptor dependency = (PluginDescriptor) plugins.get(id);
@@ -131,15 +138,17 @@
   Missing dependency  + id +
for plugin  + plugin.getPluginId());
 }
-if (dependencies.containsKey(id) && 
!id.equals("nutch-extensionpoints")) {
+if (branch.containsKey(id)) {
   throw new CircularDependencyException(
   Circular dependency detected  + id +
for plugin  + plugin.getPluginId());
 }
 dependencies.put(id, dependency);
 getPluginCheckedDependencies((PluginDescriptor) plugins.get(id),
- plugins, dependencies);
+ plugins, dependencies, branch);
   }
+  
+  branch.remove(plugin.getPluginId());
 }
 
 private Map getPluginCheckedDependencies(PluginDescriptor plugin,
@@ -147,7 +156,8 @@
   throws MissingDependencyException,
  CircularDependencyException {
   Map dependencies = new HashMap();
-  getPluginCheckedDependencies(plugin, plugins, dependencies);
+  Map branch = new HashMap();
+  getPluginCheckedDependencies(plugin, plugins, dependencies, branch);
   return dependencies;
 }
 




svn commit: r354575 - /lucene/nutch/trunk/src/web/jsp/cached.jsp

2005-12-06 Thread jerome
Author: jerome
Date: Tue Dec  6 13:42:25 2005
New Revision: 354575

URL: http://svn.apache.org/viewcvs?rev=354575&view=rev
Log:
NUTCH-112, link to cached content changed to relative (C. Mattmann)

Modified:
lucene/nutch/trunk/src/web/jsp/cached.jsp

Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/cached.jsp?rev=354575r1=354574r2=354575view=diff
==
--- lucene/nutch/trunk/src/web/jsp/cached.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/cached.jsp Tue Dec  6 13:42:25 2005
@@ -74,6 +74,6 @@
 % } else { %
 
 The cached content has mime type %=contentType%,
-click this a href=/servlet/cached?%=id%link/a to download it directly.
+click this a href=./servlet/cached?%=id%link/a to download it directly.
 
 % } %




svn commit: r354582 - /lucene/nutch/branches/mapred/src/web/jsp/cached.jsp

2005-12-06 Thread jerome
Author: jerome
Date: Tue Dec  6 14:00:12 2005
New Revision: 354582

URL: http://svn.apache.org/viewcvs?rev=354582&view=rev
Log:
NUTCH-112, merged from trunk

Modified:
lucene/nutch/branches/mapred/src/web/jsp/cached.jsp

Modified: lucene/nutch/branches/mapred/src/web/jsp/cached.jsp
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/web/jsp/cached.jsp?rev=354582r1=354581r2=354582view=diff
==
--- lucene/nutch/branches/mapred/src/web/jsp/cached.jsp (original)
+++ lucene/nutch/branches/mapred/src/web/jsp/cached.jsp Tue Dec  6 14:00:12 2005
@@ -74,6 +74,6 @@
 % } else { %
 
 The cached content has mime type %=contentType%,
-click this a href=/servlet/cached?%=id%link/a to download it directly.
+click this a href=./servlet/cached?%=id%link/a to download it directly.
 
 % } %




svn commit: r293370 - in /lucene/nutch/trunk/src: java/org/apache/nutch/parse/ParsePluginsReader.java test/org/apache/nutch/parse/TestParserFactory.java

2005-10-03 Thread jerome
Author: jerome
Date: Mon Oct  3 08:56:43 2005
New Revision: 293370

URL: http://svn.apache.org/viewcvs?rev=293370&view=rev
Log:
Change the way the parse-plugin.xml file is loaded (MalformedURLException 
reported by Earl Cahill)

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=293370r1=293369r2=293370view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java 
Mon Oct  3 08:56:43 2005
@@ -54,14 +54,17 @@
   public static final Logger LOG = 
   LogFormatter.getLogger(ParsePluginsReader.class.getName());
   
+  /** The property name of the parse-plugins location */
+  private static final String PP_FILE_PROP = parse.plugin.file;
+
   /* the parse-plugins file */
-  private String fParsePluginsFile = parse-plugins.xml;
+  private String fParsePluginsFile = null;
+
   
   /**
* Constructs a new ParsePluginsReader
*/
-  public ParsePluginsReader() {
-  }
+  public ParsePluginsReader() { }
   
   /**
* Reads the <code>parse-plugins.xml</code> file and returns the
@@ -82,33 +85,22 @@
 Document document = null;
 InputSource inputSource = null;
 
-//check to see if the Nutch conf property
-//parse.plugin.file is defined
-String parsePluginFileUrl = NutchConf.get().get(parse.plugin.file);
-
 InputStream ppInputStream = null;
-
-if (parsePluginFileUrl != null) {
-URL parsePluginUrl = null;
-
-try {
-parsePluginUrl = new URL(parsePluginFileUrl);
-ppInputStream = parsePluginUrl.openStream();
-} catch (MalformedURLException e) {
-LOG.log(Level.SEVERE,
-Unable to load parse plugins file from URL [
-+ parsePluginFileUrl + ], e);
-return null;
-} catch (IOException e) {
-LOG.log(Level.SEVERE,
-Unable to load parse plugins file from URL [
-+ parsePluginFileUrl + ], e);
-return null;
-}
-} else {
-ppInputStream = NutchConf.get().getConfResourceAsInputStream(
-fParsePluginsFile);
-}
+if (fParsePluginsFile != null) {
+  URL parsePluginUrl = null;
+  try {
+parsePluginUrl = new URL(fParsePluginsFile);
+ppInputStream = parsePluginUrl.openStream();
+  } catch (Exception e) {
+LOG.log(Level.SEVERE,
+Unable to load parse plugins file from URL  +
+[ + fParsePluginsFile + ], e);
+return pList;
+  }
+} else {
+  ppInputStream = NutchConf.get().getConfResourceAsInputStream(
+  NutchConf.get().get(PP_FILE_PROP));
+}
 
 inputSource = new InputSource(ppInputStream);
 

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=293370r1=293369r2=293370view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java 
Mon Oct  3 08:56:43 2005
@@ -37,30 +37,13 @@
  */
 public class TestParserFactory extends TestCase {

-  private NutchConf conf = null;
   
   public TestParserFactory(String name) { super(name); }
-  
-  private void initNutchConf(String testFile) {
-// set the Nutch Conf property for parse.plugin.file.url
-// to ${test.src.dir}/org/apache/nutch/parse/parse-plugin-test.xml
-String testParsePluginFileUrl = null;
-try{
-  testParsePluginFileUrl = new File(System.getProperty(test.src.dir)
-  +/org/apache/nutch/parse/ + testFile).toURL().toString();
-  NutchConf.get().set(parse.plugin.file.url,testParsePluginFileUrl);
-  this.conf = NutchConf.get();
-}
-catch(MalformedURLException e){
-  throw new RuntimeException(Unable to load parse-plugins.xml file from 
URL: +testParsePluginFileUrl);
-}
-  }
-  
-  /** Inits the Test Case: loads the Nutch Conf instance. */
+
+  /** Inits the Test Case with the test parse-plugin file */
   protected void setUp() throws Exception {
-if (conf == null) {
-  initNutchConf(parse-plugin-test.xml);
-}
+NutchConf.get().set(parse.plugin.file,
+org/apache/nutch/parse/parse

svn commit: r292035 - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/parse/ src/test/org/apache/nutch/parse/

2005-09-27 Thread jerome
Author: jerome
Date: Tue Sep 27 13:45:37 2005
New Revision: 292035

URL: http://svn.apache.org/viewcvs?rev=292035&view=rev
Log:
NUTCH-88, First step proposal implementation (thanks to Chris Mattmann and 
Sébastien Le Callonnec)

Added:
lucene/nutch/trunk/conf/parse-plugins.dtd   (with props)
lucene/nutch/trunk/conf/parse-plugins.xml   (with props)
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java   
(with props)
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java  
 (with props)
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java   
(with props)
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java

Added: lucene/nutch/trunk/conf/parse-plugins.dtd
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=292035view=auto
==
--- lucene/nutch/trunk/conf/parse-plugins.dtd (added)
+++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Sep 27 13:45:37 2005
@@ -0,0 +1,7 @@
+!ELEMENT parse-plugins (mimeType+)
+!ELEMENT mimeType (plugin+)
+!ATTLIST mimeType name CDATA #REQUIRED
+
+!ELEMENT plugin EMPTY
+!ATTLIST plugin id CDATA #REQUIRED
+!ATTLIST plugin order CDATA ''
\ No newline at end of file

Propchange: lucene/nutch/trunk/conf/parse-plugins.dtd
--
svn:eol-style = native

Added: lucene/nutch/trunk/conf/parse-plugins.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=292035view=auto
==
--- lucene/nutch/trunk/conf/parse-plugins.xml (added)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Sep 27 13:45:37 2005
@@ -0,0 +1,207 @@
+?xml version=1.0 encoding=UTF-8?
+!--
+   Copyright 2005 The Apache Software Foundation
+   
+   Licensed under the Apache License, Version 2.0 (the License);
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   
+   http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an AS IS BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+   
+   Author : mattmann 
+   Description: This xml file represents a natural ordering for which 
parsing 
+   plugin should get called for a particular mimeType. 
+--
+
+parse-plugins
+
+   !--  by default if the mimeType is set to *, or 
+   can't be determined, use parse-text --
+   mimeType name=*
+   plugin id=parse-text /
+   /mimeType
+
+   mimeType name=application/java
+   plugin id=parse-text /
+   /mimeType
+
+   mimeType name=application/msword
+   plugin id=parse-msword /
+   /mimeType
+
+   mimeType name=application/pdf
+   plugin id=parse-pdf /
+   plugin id=parse-text /
+   /mimeType
+
+   mimeType name=application/postscript
+   plugin id=parse-pdf /
+   /mimeType
+
+   mimeType name=application/rss+xml
+   plugin id=parse-rss /
+   plugin id=parse-text /
+   /mimeType
+
+   mimeType name=application/vnd.ms-excel
+   plugin id=parse-msexcel /
+   /mimeType
+
+   mimeType name=application/vnd.ms-powerpoint
+   plugin id=parse-mspowerpoint /
+   /mimeType
+
+   mimeType name=application/vnd.wap.wbxml
+   plugin id=parse-text /
+   /mimeType
+
+   mimeType name=application/vnd.wap.wmlc
+   plugin id=parse-text /
+   /mimeType
+
+   mimeType name=application/vnd.wap.wmlscriptc
+   plugin id=parse-text /
+   /mimeType
+
+   mimeType name=application/xhtml+xml
+   plugin id=parse-text /
+   /mimeType
+
+   mimeType name=application/x-bzip2
+   !--  try and parse it with the zip parser --
+   plugin id=parse-zip /
+   /mimeType
+
+   mimeType name=application/x-csh
+   plugin id=parse-text /
+   /mimeType
+
+   mimeType name=application/x-gzip
+   !--  try and parse it with the zip parser --
+   plugin id=parse-zip /
+   /mimeType
+
+   mimeType name=application/x-javascript
+   plugin id=parse-js /
+   plugin id=parse-text /
+   /mimeType
+
+   mimeType name=application/x-kword
+   !--  try and parse it with the word parser --
+   plugin id=parse-msword /
+   /mimeType
+
+   mimeType name=application/x-kspread
+   !--  try and parse

svn commit: r280549 - /lucene/nutch/trunk/src/plugin/build.xml

2005-09-13 Thread jerome
Author: jerome
Date: Tue Sep 13 05:52:13 2005
New Revision: 280549

URL: http://svn.apache.org/viewcvs?rev=280549&view=rev
Log:
Sorted alphabetically for easy maintenance

Modified:
lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=280549r1=280548r2=280549view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Sep 13 05:52:13 2005
@@ -6,89 +6,89 @@
   !-- Build  deploy all the plugin jars.--
   !-- == --
   target name=deploy
+ ant dir=clustering-carrot2 target=deploy/
+ ant dir=creativecommons target=deploy/
+ ant dir=index-basic target=deploy/
+ ant dir=index-more target=deploy/
+ ant dir=languageidentifier target=deploy/
  ant dir=lib-jakarta-poi target=deploy/
  ant dir=nutch-extensionpoints target=deploy/
+ ant dir=ontology target=deploy/
  ant dir=protocol-file target=deploy/
  ant dir=protocol-ftp target=deploy/
  ant dir=protocol-http target=deploy/
  ant dir=protocol-httpclient target=deploy/
+ ant dir=parse-ext target=deploy/
  ant dir=parse-html target=deploy/
  ant dir=parse-js target=deploy/
- ant dir=parse-text target=deploy/
+ !-- ant dir=parse-mp3 target=deploy/ --
+ ant dir=parse-mspowerpoint target=deploy/
+ ant dir=parse-msword target=deploy/
  ant dir=parse-pdf target=deploy/
  ant dir=parse-rss target=deploy/
- ant dir=parse-msword target=deploy/
- ant dir=parse-mspowerpoint target=deploy/
-!-- ant dir=parse-mp3 target=deploy/ --
-!-- ant dir=parse-rtf target=deploy/ --
- ant dir=parse-ext target=deploy/
+ !-- ant dir=parse-rtf target=deploy/ --
+ ant dir=parse-text target=deploy/
  ant dir=parse-zip target=deploy/
- ant dir=index-basic target=deploy/
- ant dir=index-more target=deploy/
  ant dir=query-basic target=deploy/
  ant dir=query-more target=deploy/
  ant dir=query-site target=deploy/
  ant dir=query-url target=deploy/
- ant dir=urlfilter-regex target=deploy/
  ant dir=urlfilter-prefix target=deploy/
- ant dir=creativecommons target=deploy/
- ant dir=languageidentifier target=deploy/
- ant dir=clustering-carrot2 target=deploy/
- ant dir=ontology target=deploy/
+ ant dir=urlfilter-regex target=deploy/
   /target
 
   !-- == --
   !-- Test all of the plugins.   --
   !-- == --
   target name=test
+ ant dir=creativecommons target=test/
+ ant dir=languageidentifier target=test/
+ ant dir=ontology target=test/
  ant dir=protocol-http target=test/
+ ant dir=parse-ext target=test/
  ant dir=parse-html target=test/
+ !-- ant dir=parse-mp3 target=test/ --
+ ant dir=parse-mspowerpoint target=test/
+ ant dir=parse-msword target=test/
  ant dir=parse-pdf target=test/
  ant dir=parse-rss target=test/
- ant dir=parse-msword target=test/
- ant dir=parse-mspowerpoint target=test/
- !-- ant dir=parse-mp3 target=test/ --
  !-- ant dir=parse-rtf target=test/ --
- ant dir=parse-ext target=test/
  ant dir=parse-zip target=test/
- ant dir=creativecommons target=test/
- ant dir=languageidentifier target=test/
- ant dir=ontology target=test/
   /target
 
   !-- == --
   !-- Clean all of the plugins.  --
   !-- == --
   target name=clean
+ant dir=clustering-carrot2 target=clean/
+ant dir=creativecommons target=clean/
+ant dir=index-basic target=clean/
+ant dir=index-more target=clean/
+ant dir=languageidentifier target=clean/
 ant dir=lib-jakarta-poi target=clean/
 ant dir=nutch-extensionpoints target=clean/
+ant dir=ontology target=clean/
 ant dir=protocol-file target=clean/
 ant dir=protocol-ftp target=clean/
 ant dir=protocol-http target=clean/
 ant dir=protocol-httpclient target=clean/
+ant dir=parse-ext target=clean/
 ant dir=parse-html target=clean/
 ant dir=parse-js target=clean/
-ant dir=parse-text target=clean/
+ant dir=parse-mp3 target=clean/
+ant dir=parse-mspowerpoint target=clean/
+ant dir=parse-msword target=clean/
 ant dir=parse-pdf target=clean/
 ant dir=parse-rss target=clean/
-ant dir=parse-msword target=clean/
-ant dir=parse-mspowerpoint target=clean/
-ant dir=parse-mp3 target=clean/
 ant dir=parse-rtf target=clean/
-ant dir=parse-ext target=clean/
+ant dir=parse-text target=clean/
 ant dir=parse-zip target=clean/
-ant dir=index-basic target=clean

svn commit: r280551 - in /lucene/nutch/trunk/src/plugin: build.xml lib-lucene-analyzers/ lib-lucene-analyzers/build.xml lib-lucene-analyzers/lib/ lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar lib-lucene-analyzers/plugin.xml

2005-09-13 Thread jerome
Author: jerome
Date: Tue Sep 13 06:06:32 2005
New Revision: 280551

URL: http://svn.apache.org/viewcvs?rev=280551&view=rev
Log:
Add a lib plugin for lucene analyzers

Added:
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml   (with props)
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/

lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar
   (with props)
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml   (with props)
Modified:
lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=280551r1=280550r2=280551view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Sep 13 06:06:32 2005
@@ -12,6 +12,7 @@
  ant dir=index-more target=deploy/
  ant dir=languageidentifier target=deploy/
  ant dir=lib-jakarta-poi target=deploy/
+ ant dir=lib-lucene-analyzers target=deploy/
  ant dir=nutch-extensionpoints target=deploy/
  ant dir=ontology target=deploy/
  ant dir=protocol-file target=deploy/
@@ -66,6 +67,7 @@
 ant dir=index-more target=clean/
 ant dir=languageidentifier target=clean/
 ant dir=lib-jakarta-poi target=clean/
+ant dir=lib-lucene-analyzers target=clean/
 ant dir=nutch-extensionpoints target=clean/
 ant dir=ontology target=clean/
 ant dir=protocol-file target=clean/

Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml?rev=280551view=auto
==
--- lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml Tue Sep 13 
06:06:32 2005
@@ -0,0 +1,17 @@
+?xml version=1.0?
+
+project name=lib-lucene-analyzers default=jar
+
+  import file=../build-plugin.xml/
+
+  !--
+   ! Override the compile and jar targets,
+   ! since there is nothing to compile here.
+   ! --
+  target name=compile depends=init
+echo message=Compiling plugin: ${name}/
+  /target
+
+  target name=jar depends=compile/
+
+/project

Propchange: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/build.xml
--
svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar?rev=280551view=auto
==
Binary file - no diff available.

Propchange: 
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml?rev=280551view=auto
==
--- lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml Tue Sep 13 
06:06:32 2005
@@ -0,0 +1,21 @@
+?xml version=1.0 encoding=UTF-8?
+!--
+ ! Lucene Analyzers
+ ! (http://lucene.apache.org/java/docs/lucene-sandbox/)
+ !
+ ! Download : http://www.apache.org/dyn/closer.cgi/jakarta/lucene/binaries/
+ ! License : http://www.apache.org/licenses/LICENSE-2.0.txt
+ !--
+plugin
+   id=lib-lucene-analyzers
+   name=Lucene Analysers
+   version=1.9-rc1-dev
+   provider-name=org.apache.lucene
+
+   runtime
+ library name=lucene-analyzers-1.9-rc1-dev.jar
+export name=*/
+ /library
+   /runtime
+
+/plugin

Propchange: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml
--
svn:eol-style = native




svn commit: r280556 - in /lucene/nutch/trunk/src/plugin: ./ analysis-de/ analysis-de/src/ analysis-de/src/java/ analysis-de/src/java/org/ analysis-de/src/java/org/apache/ analysis-de/src/java/org/apache/nutch/ analysis-de/src/java/org/apache/nutch/anal...

2005-09-13 Thread jerome
Author: jerome
Date: Tue Sep 13 07:03:36 2005
New Revision: 280556

URL: http://svn.apache.org/viewcvs?rev=280556&view=rev
Log:
French and German analyzers added

Added:
lucene/nutch/trunk/src/plugin/analysis-de/
lucene/nutch/trunk/src/plugin/analysis-de/build.xml   (with props)
lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml   (with props)
lucene/nutch/trunk/src/plugin/analysis-de/src/
lucene/nutch/trunk/src/plugin/analysis-de/src/java/
lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/
lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/
lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/

lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/

lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/

lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java
   (with props)
lucene/nutch/trunk/src/plugin/analysis-fr/
lucene/nutch/trunk/src/plugin/analysis-fr/build.xml   (with props)
lucene/nutch/trunk/src/plugin/analysis-fr/plugin.xml   (with props)
lucene/nutch/trunk/src/plugin/analysis-fr/src/
lucene/nutch/trunk/src/plugin/analysis-fr/src/java/
lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/
lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/
lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/

lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/

lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/

lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java
   (with props)
Modified:
lucene/nutch/trunk/src/plugin/build.xml

Added: lucene/nutch/trunk/src/plugin/analysis-de/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/build.xml?rev=280556view=auto
==
--- lucene/nutch/trunk/src/plugin/analysis-de/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/analysis-de/build.xml Tue Sep 13 07:03:36 2005
@@ -0,0 +1,13 @@
+?xml version=1.0?
+
+project name=analysis-de default=jar
+
+  import file=../build-plugin.xml/
+
+  path id=plugin.deps
+fileset dir=../lib-lucene-analyzers/lib
+  include name=*.jar /
+/fileset
+  /path
+
+/project

Propchange: lucene/nutch/trunk/src/plugin/analysis-de/build.xml
--
svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml?rev=280556view=auto
==
--- lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml Tue Sep 13 07:03:36 
2005
@@ -0,0 +1,29 @@
+?xml version=1.0 encoding=UTF-8?
+plugin
+   id=analysis-de
+   name=German Analysis Plug-in
+   version=1.0.0
+   provider-name=org.apache.nutch
+
+   runtime
+  library name=analysis-de.jar
+ export name=*/
+  /library
+   /runtime
+
+   requires
+  import plugin=nutch-extensionpoints/
+  import plugin=lib-lucene-analyzers/
+   /requires
+
+   extension id=org.apache.nutch.analysis.de
+  name=GermanAnalyzer
+  point=org.apache.nutch.analysis.NutchAnalyzer
+
+  implementation id=org.apache.nutch.analysis.de.GermanAnalyzer
+  class=org.apache.nutch.analysis.de.GermanAnalyzer
+  lang=de/
+
+   /extension
+
+/plugin

Propchange: lucene/nutch/trunk/src/plugin/analysis-de/plugin.xml
--
svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java?rev=280556view=auto
==
--- 
lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java
 Tue Sep 13 07:03:36 2005
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the License);
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY

svn commit: r280176 - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/plugin/

2005-09-11 Thread jerome
Author: jerome
Date: Sun Sep 11 13:28:07 2005
New Revision: 280176

URL: http://svn.apache.org/viewcvs?rev=280176&view=rev
Log:
Automatically loads active plugins dependencies (add a property, default is on)

Added:

lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java
   (with props)

lucene/nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java
   (with props)
Modified:
lucene/nutch/trunk/conf/nutch-default.xml

lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=280176r1=280175r2=280176view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sun Sep 11 13:28:07 2005
@@ -586,8 +586,17 @@
 /property
 
 property
+  nameplugin.auto-activation/name
+  valuefalse/value
+  descriptionDefines if some plugins that are not activated regarding
+  the plugin.includes and plugin.excludes properties must be automatically
+  activated if they are needed by some activated plugins.
+  /description
+/property
+
+property
   nameplugin.includes/name
-  
valuenutch-extensionpoints|protocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value
+  
valueprotocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value
   descriptionRegular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By

Added: 
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java?rev=280176view=auto
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java
 (added)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java
 Sun Sep 11 13:28:07 2005
@@ -0,0 +1,35 @@
+/*
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the License);
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+
+/**
+ * <code>CircularDependencyException</code> will be thrown if a circular
+ * dependency is detected.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class CircularDependencyException extends Exception {
+
+  public CircularDependencyException(Throwable cause) {
+super(cause);
+  }
+
+  public CircularDependencyException(String message) {
+super(message);
+  }
+}

Propchange: 
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java
--
svn:eol-style = native

Added: 
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java?rev=280176view=auto
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java
 (added)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java
 Sun Sep 11 13:28:07 2005
@@ -0,0 +1,35 @@
+/*
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the License);
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+
+/**
+ * <code>MissingDependencyException</code> will be thrown if a plugin
+ * dependency cannot be found.
+ * 
+ * @author J&eacute;r&ocirc;me Charron

svn commit: r280179 - in /lucene/nutch/trunk/src/plugin: clustering-carrot2/ creativecommons/ index-basic/ index-more/ languageidentifier/ ontology/ parse-ext/ parse-html/ parse-js/ parse-mp3/ parse-mspowerpoint/ parse-msword/ parse-pdf/ parse-rss/ par...

2005-09-11 Thread jerome
Author: jerome
Date: Sun Sep 11 13:34:12 2005
New Revision: 280179

URL: http://svn.apache.org/viewcvs?rev=280179&view=rev
Log:
Add a dependency to nutch-extensionpoints plugin

Modified:
lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml
lucene/nutch/trunk/src/plugin/index-basic/plugin.xml
lucene/nutch/trunk/src/plugin/index-more/plugin.xml
lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml
lucene/nutch/trunk/src/plugin/ontology/plugin.xml
lucene/nutch/trunk/src/plugin/parse-ext/plugin.xml
lucene/nutch/trunk/src/plugin/parse-html/plugin.xml
lucene/nutch/trunk/src/plugin/parse-js/plugin.xml
lucene/nutch/trunk/src/plugin/parse-mp3/plugin.xml
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml
lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml
lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
lucene/nutch/trunk/src/plugin/parse-rtf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-text/plugin.xml
lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-file/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-ftp/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-http/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
lucene/nutch/trunk/src/plugin/query-basic/plugin.xml
lucene/nutch/trunk/src/plugin/query-more/plugin.xml
lucene/nutch/trunk/src/plugin/query-site/plugin.xml
lucene/nutch/trunk/src/plugin/query-url/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=280179r1=280178r2=280179view=diff
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Sun Sep 11 
13:34:12 2005
@@ -27,6 +27,10 @@
   library name=nekohtml-0.9.2.jar/

/runtime

 

+   requires

+  import plugin=nutch-extensionpoints/

+   /requires

+

extension id=org.apache.nutch.clustering.carrot2

   name=Carrot2 Clusterer

   point=org.apache.nutch.clustering.OnlineClusterer


Modified: lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml?rev=280179r1=280178r2=280179view=diff
==
--- lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml Sun Sep 11 
13:34:12 2005
@@ -11,6 +11,10 @@
   /library
/runtime
 
+   requires
+  import plugin=nutch-extensionpoints/
+   /requires
+
extension id=org.creativecommons.nutch.CCParseFilter
   name=Creative Commons Metadata Filter
   point=org.apache.nutch.parse.HtmlParseFilter

Modified: lucene/nutch/trunk/src/plugin/index-basic/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/plugin.xml?rev=280179r1=280178r2=280179view=diff
==
--- lucene/nutch/trunk/src/plugin/index-basic/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/index-basic/plugin.xml Sun Sep 11 13:34:12 
2005
@@ -12,6 +12,10 @@
   /library
/runtime
 
+   requires
+  import plugin=nutch-extensionpoints/
+   /requires
+
extension id=org.apache.nutch.indexer.basic
   name=Nutch Basic Indexing Filter
   point=org.apache.nutch.indexer.IndexingFilter

Modified: lucene/nutch/trunk/src/plugin/index-more/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/plugin.xml?rev=280179r1=280178r2=280179view=diff
==
--- lucene/nutch/trunk/src/plugin/index-more/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/index-more/plugin.xml Sun Sep 11 13:34:12 2005
@@ -12,6 +12,10 @@
   /library
/runtime
 
+   requires
+  import plugin=nutch-extensionpoints/
+   /requires
+
extension id=org.apache.nutch.indexer.more
   name=Nutch More Indexing Filter
   point=org.apache.nutch.indexer.IndexingFilter

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml?rev=280179r1=280178r2=280179view=diff
==
--- lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml (original

svn commit: r279286 - /lucene/nutch/trunk/conf/nutch-default.xml

2005-09-07 Thread jerome
Author: jerome
Date: Wed Sep  7 02:57:24 2005
New Revision: 279286

URL: http://svn.apache.org/viewcvs?rev=279286&view=rev
Log:
Includes protocol-httpclient plugin (instead of protocol-http) and parse-js 
erroneously removed during commit of revision 233492

Modified:
lucene/nutch/trunk/conf/nutch-default.xml

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=279286r1=279285r2=279286view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Sep  7 02:57:24 2005
@@ -587,7 +587,7 @@
 
 <property>
   <name>plugin.includes</name>
-  <value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)</value>
+  <value>nutch-extensionpoints|protocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By




svn commit: r279027 - /lucene/nutch/trunk/src/plugin/build.xml

2005-09-06 Thread jerome
Author: jerome
Date: Tue Sep  6 09:02:26 2005
New Revision: 279027

URL: http://svn.apache.org/viewcvs?rev=279027&view=rev
Log:
lib-jakarta-poi added to the list of plugins to deploy/clean

Modified:
lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=279027r1=279026r2=279027view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Sep  6 09:02:26 2005
@@ -6,6 +6,7 @@
   !-- Build  deploy all the plugin jars.--
   !-- == --
   target name=deploy
+ ant dir=lib-jakarta-poi target=deploy/
  ant dir=nutch-extensionpoints target=deploy/
  ant dir=protocol-file target=deploy/
  ant dir=protocol-ftp target=deploy/
@@ -59,6 +60,7 @@
   !-- Clean all of the plugins.  --
   !-- == --
   target name=clean
+ant dir=lib-jakarta-poi target=clean/
 ant dir=nutch-extensionpoints target=clean/
 ant dir=protocol-file target=clean/
 ant dir=protocol-ftp target=clean/




svn commit: r278626 - in /lucene/nutch/trunk/src/plugin: ./ parse-zip/ parse-zip/sample/ parse-zip/src/ parse-zip/src/java/ parse-zip/src/java/org/ parse-zip/src/java/org/apache/ parse-zip/src/java/org/apache/nutch/ parse-zip/src/java/org/apache/nutch/...

2005-09-04 Thread jerome
Author: jerome
Date: Sun Sep  4 13:53:49 2005
New Revision: 278626

URL: http://svn.apache.org/viewcvs?rev=278626&view=rev
Log:
NUTCH-53, Parser plugin for Zip files (Rohit Kulkarni)

Added:
lucene/nutch/trunk/src/plugin/parse-zip/
lucene/nutch/trunk/src/plugin/parse-zip/build.xml   (with props)
lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml   (with props)
lucene/nutch/trunk/src/plugin/parse-zip/sample/
lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip   (with props)
lucene/nutch/trunk/src/plugin/parse-zip/src/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/

lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
   (with props)

lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
   (with props)
lucene/nutch/trunk/src/plugin/parse-zip/src/test/
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/

lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
   (with props)
Modified:
lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=278626r1=278625r2=278626view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Sun Sep  4 13:53:49 2005
@@ -21,6 +21,7 @@
 !-- ant dir=parse-mp3 target=deploy/ --
 !-- ant dir=parse-rtf target=deploy/ --
  ant dir=parse-ext target=deploy/
+ ant dir=parse-zip target=deploy/
  ant dir=index-basic target=deploy/
  ant dir=index-more target=deploy/
  ant dir=query-basic target=deploy/
@@ -48,6 +49,7 @@
  !-- ant dir=parse-mp3 target=test/ --
  !-- ant dir=parse-rtf target=test/ --
  ant dir=parse-ext target=test/
+ ant dir=parse-zip target=test/
  ant dir=creativecommons target=test/
  ant dir=languageidentifier target=test/
  ant dir=ontology target=test/
@@ -72,6 +74,7 @@
 ant dir=parse-mp3 target=clean/
 ant dir=parse-rtf target=clean/
 ant dir=parse-ext target=clean/
+ant dir=parse-zip target=clean/
 ant dir=index-basic target=clean/
 ant dir=index-more target=clean/
 ant dir=query-basic target=clean/

Added: lucene/nutch/trunk/src/plugin/parse-zip/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/build.xml?rev=278626view=auto
==
--- lucene/nutch/trunk/src/plugin/parse-zip/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-zip/build.xml Sun Sep  4 13:53:49 2005
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+
+<project name="parse-zip" default="jar">
+
+  <import file="../build-plugin.xml"/>
+  
+   <!-- for junit test -->
+   <mkdir dir="${build.test}/data" />
+   <copy todir="${build.test}/data">
+   <fileset dir="sample">
+   <include name="*.zip" />
+   </fileset>
+   </copy>
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/parse-zip/build.xml
--
svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml?rev=278626view=auto
==
--- lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml Sun Sep  4 13:53:49 2005
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-zip"
+   name="Zip Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+  <library name="parse-zip.jar">
+ <export name="*"/>
+  </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.parse.zip"
+  name="ZipParser" 
+  point="org.apache.nutch.parse.Parser">
+
+  <implementation id="org.apache.nutch.parse.zip.ZipParser" 
+  class="org.apache.nutch.parse.zip.ZipParser" 
+  contentType="application/zip"
+  pathSuffix="zip"/>
+   </extension>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml

svn commit: r265794 - in /lucene/nutch/trunk: lib/commons-lang-2.1.jar src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

2005-09-01 Thread jerome
Author: jerome
Date: Thu Sep  1 15:20:51 2005
New Revision: 265794

URL: http://svn.apache.org/viewcvs?rev=265794&view=rev
Log:
NUTCH-65, Handles more modification-date format

Added:
lucene/nutch/trunk/lib/commons-lang-2.1.jar   (with props)
Modified:

lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Added: lucene/nutch/trunk/lib/commons-lang-2.1.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/commons-lang-2.1.jar?rev=265794view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/commons-lang-2.1.jar
--
svn:mime-type = application/octet-stream

Modified: 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=265794r1=265793r2=265794view=diff
==
--- 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Thu Sep  1 15:20:51 2005
@@ -53,7 +53,7 @@
 import java.util.Enumeration;
 import java.util.Properties;
 
-
+import org.apache.commons.lang.time.DateUtils;
 /**
  * Add (or reset) a few metaData properties as respective fields
  * (if they are available), so that they can be displayed by more.jsp
@@ -133,21 +133,37 @@
 try {
   time = HttpDateFormat.toLong(date);
 } catch (ParseException e) {
-  // try to parse it as date in alternative format
-  String date2 = date;
-  try {
-if (date.length()  25 ) date2 = date.substring(0, 25);
-DateFormat df = new SimpleDateFormat(EEE, dd MMM  HH:mm:ss, 
Locale.US);
-time = df.parse(date2).getTime();
-  } catch (Exception e1) {
-try {
-  if (date.length()  24 ) date2 = date.substring(0, 24);
-  DateFormat df = new SimpleDateFormat(EEE MMM dd HH:mm:ss , 
Locale.US);
-  time = df.parse(date2).getTime();
-} catch (Exception e2) {
-  LOG.warning(url + : can't parse erroneous date:  + date);
-}
-  }
+   // try to parse it as date in alternative format
+   try {
+   Date parsedDate = DateUtils.parseDate(date,
+ new String [] {
+ EEE MMM dd HH:mm:ss ,
+ EEE MMM dd HH:mm:ss  zzz,
+ EEE, MMM dd HH:mm:ss  zzz,
+ EEE, dd MMM  HH:mm:ss zzz,
+ EEE,dd MMM  HH:mm:ss zzz,
+ EEE, dd MMM  HH:mm:sszzz,
+ EEE, dd MMM  HH:mm:ss,
+ EEE, dd-MMM-yy HH:mm:ss zzz,
+ /MM/dd HH:mm:ss.SSS zzz,
+ /MM/dd HH:mm:ss.SSS,
+ /MM/dd HH:mm:ss zzz,
+ /MM/dd,
+ .MM.dd HH:mm:ss,
+ -MM-dd HH:mm,
+ MMM dd  HH:mm:ss. zzz,
+ MMM dd  HH:mm:ss zzz,
+ dd.MM. HH:mm:ss zzz,
+ dd MM  HH:mm:ss zzz,
+ dd.MM.; HH:mm:ss,
+ dd.MM. HH:mm:ss,
+ dd.MM. zzz
+ });
+   time = parsedDate.getTime();
+   //  LOG.warning(url + : parsed date:  + date + to:+time);
+   } catch (Exception e2) {
+   LOG.warning(url + : can't parse erroneous date:  + date);
+   }
 }
 return time;
   }




svn commit: r264964 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java

2005-08-31 Thread jerome
Author: jerome
Date: Wed Aug 31 01:04:52 2005
New Revision: 264964

URL: http://svn.apache.org/viewcvs?rev=264964&view=rev
Log:
No more NullPointerException while logging the doc language if none

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=264964r1=264963r2=264964view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java Wed 
Aug 31 01:04:52 2005
@@ -145,8 +145,9 @@
 
   // add the document to the index
   NutchAnalyzer analyzer = AnalyzerFactory.get(doc.get(lang));
-  LOG.info( Indexing [ + doc.getField(url).stringValue() +
-   ] with analyzer  + analyzer +  ( + 
doc.getField(lang).stringValue() + ));
+  LOG.info( Indexing [ + doc.getField(url).stringValue() + ] 
+
+with analyzer  + analyzer +
+( + doc.get(lang) + ));
   //LOG.info( Doc is  + doc);
   writer.addDocument(doc, analyzer);
   if (count  0  count % LOG_STEP == 0) {




svn commit: r265020 - /lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java

2005-08-31 Thread jerome
Author: jerome
Date: Wed Aug 31 04:38:28 2005
New Revision: 265020

URL: http://svn.apache.org/viewcvs?rev=265020&view=rev
Log:
Fixes some typos (analySer => analyZer)

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=265020r1=265019r2=265020view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java 
Wed Aug 31 04:38:28 2005
@@ -44,7 +44,7 @@
 
   private final static Map CACHE = new HashMap();
 
-  private final static NutchAnalyzer DEFAULT_ANALYSER = 
+  private final static NutchAnalyzer DEFAULT_ANALYZER = 
 new NutchDocumentAnalyzer();
   
   
@@ -60,22 +60,22 @@
 
   
   /**
-   * Returns the appropriate {@link Analyser} implementation given a language
-   * code.
+   * Returns the appropriate {@link NutchAnalyzer analyzer} implementation
+   * given a language code.
*
-   * <p>NutchAnalyser extensions should define the attribute "lang". The first
+   * <p>NutchAnalyzer extensions should define the attribute "lang". The first
* plugin found whose lang attribute equals the specified lang parameter is
* used. If none match, then the {@link NutchDocumentAnalyzer} is used.
*/
   public static NutchAnalyzer get(String lang) {
 
-NutchAnalyzer analyzer = DEFAULT_ANALYSER;
+NutchAnalyzer analyzer = DEFAULT_ANALYZER;
 Extension extension = getExtension(lang);
 if (extension != null) {
 try {
 analyzer = (NutchAnalyzer) extension.getExtensionInstance();
 } catch (PluginRuntimeException pre) {
-analyzer = DEFAULT_ANALYSER;
+analyzer = DEFAULT_ANALYZER;
 }
 }
 return analyzer;




svn commit: r265503 - in /lucene/nutch/trunk/src: java/org/apache/nutch/clustering/ java/org/apache/nutch/fs/ java/org/apache/nutch/mapReduce/ java/org/apache/nutch/parse/ java/org/apache/nutch/protoc

2005-08-31 Thread jerome
Author: jerome
Date: Wed Aug 31 08:17:11 2005
New Revision: 265503

URL: http://svn.apache.org/viewcvs?rev=265503&view=rev
Log:
Merged 0.7 branch changes 240321:240453 into trunk

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java
lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/MapOutputFile.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/RecordReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/package.html
lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolException.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceGone.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceMoved.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/RetryLater.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/Daemon.java

lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java

lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java

lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java?rev=265503r1=265502r2=265503view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java 
Wed Aug 31 08:17:11 2005
@@ -23,8 +23,8 @@
  * algorithms.
  *
  * <p>By the term <b>online</b> search results clustering we will understand
- * a clusterer that works on a set of {@link Hit}s retrieved for a user's query
- * and produces a set of {@link Clusters} that can be displayed to help
+ * a clusterer that works on a set of {@link HitDetails} retrieved for a user's
+ * query and produces a set of {@link HitsCluster} that can be displayed to help
  * the user gain insight in the topics found in the result.</p>
  *
  * <p>Other clustering options include predefined categories and off-line

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=265503r1=265502r2=265503view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java Wed 
Aug 31 08:17:11 2005
@@ -80,8 +80,8 @@
   return getNamed(NutchConf.get().get(fs.default.name, local));
 }
 
-/** Returns a name for this filesystem, suitable to pass to {@link
- * NutchFileSystem#getNamed(String).*/
+/** Returns a name for this filesystem, suitable to pass to
+ * {@link NutchFileSystem#getNamed(String)}.*/
 public abstract String getName();
   
 /** Returns a named filesystem.  Names are either the string local or a

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java?rev=265503r1=265502r2=265503view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java Wed 
Aug 31 08:17:11 2005
@@ -25,9 +25,12 @@
 import org.apache.nutch.io.UTF8;
 import org.apache.nutch.fs.NutchFileSystem;
 
-/** A section of an input file.  Returned by {@link
- * InputFormat#getSplits(File[], int)} and passed to
- * InputFormat#getRecordReader(FileSplit). */
+/**
+ * A section of an input file.
+ * Returned by {@link InputFormat#getSplits(NutchFileSystem, JobConf, int)}
+ * and passed to
+ * {@link InputFormat#getRecordReader(NutchFileSystem, FileSplit, JobConf)}.
+ */
 public class FileSplit implements Writable {
   private File file;
   private long start;

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch

svn commit: r240254 - in /lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang: HTMLLanguageParser.java LanguageIdentifier.java LanguageIndexingFilter.java LanguageQueryFilter.java NGramProfile.java

2005-08-26 Thread jerome
Author: jerome
Date: Fri Aug 26 07:54:16 2005
New Revision: 240254

URL: http://svn.apache.org/viewcvs?rev=240254&view=rev
Log:
Javadoc updates, corrections on input stream reading

Modified:

lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java

lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java

lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java

lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java

lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java

Modified: 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=240254r1=240253r2=240254view=diff
==
--- 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 (original)
+++ 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 Fri Aug 26 07:54:16 2005
@@ -23,20 +23,37 @@
 import java.util.logging.Logger;
 import org.apache.nutch.util.LogFormatter;
 
-/** Adds metadata identifying language of document if found
- * We could also run statistical analysis here but we'd miss all other formats
+/**
+ * An {@link org.apache.nutch.parse.HtmlParseFilter} that looks for possible
+ * indications of content language.
+ *
+ * If some indication is found, it is added in the {@link #META_LANG_NAME}
+ * attribute of the {@link org.apache.nutch.parse.ParseData} metadata.
+ *
+ * @author Sami Siren
+ * @author Jerome Charron
  */
 public class HTMLLanguageParser implements HtmlParseFilter {
+
+  /** The language meta data attribute name */
   public static final String META_LANG_NAME=X-meta-lang;
-  public static final Logger LOG = LogFormatter
+  
+  private static final Logger LOG = LogFormatter
 .getLogger(HTMLLanguageParser.class.getName());
 
   /**
-   * Scan the HTML document looking at possible indications of content language<br>
-   * <li>1. html lang attribute (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
-   * <li>2. meta dc.language (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)
-   * <li>3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
-   * <br>Only the first occurence of language is stored.
+   * Scan the HTML document looking at possible indications of content language.
+   * <ol>
+   * <li>html lang attribute
+   * (<a href="http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1">
+   * http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1</a>),</li>
+   * <li>meta dc.language (<a href="http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language">
+   * http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language</a>),</li>
+   * <li>meta http-equiv (content-language) (
+   * <a href="http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2">
+   * http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2</a>).</li>
+   * </ol>
+   * Only the first occurence of language is stored.
*/
   public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, 
DocumentFragment doc) {
 String lang = findLanguage(doc);

Modified: 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=240254r1=240253r2=240254view=diff
==
--- 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
 (original)
+++ 
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
 Fri Aug 26 07:54:16 2005
@@ -20,6 +20,7 @@
 import java.io.InputStream;
 import java.io.IOException;
 import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
 import java.util.List;
@@ -48,6 +49,10 @@
 
 
 /**
+ * Identify the language of a content, based on statistical analysis.
+ *
+ * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ *  Language Codes

svn commit: r240345 - in /lucene/nutch/branches/Release-0.7/src: java/org/apache/nutch/clustering/ java/org/apache/nutch/fs/ java/org/apache/nutch/mapReduce/ java/org/apache/nutch/parse/ java/org/apache/nutch/protocol/ java/org/apache/nutch/searcher/ j...

2005-08-26 Thread jerome
Author: jerome
Date: Fri Aug 26 14:03:19 2005
New Revision: 240345

URL: http://svn.apache.org/viewcvs?rev=240345&view=rev
Log:
NUTCH-37, no more javadoc warnings

Modified:

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/clustering/OnlineClusterer.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/fs/NutchFileSystem.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/FileSplit.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/MapOutputFile.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/RecordReader.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/package.html
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/parse/Parse.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/protocol/Content.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/protocol/ProtocolException.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/protocol/ResourceGone.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/protocol/ResourceMoved.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/protocol/RetryLater.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/searcher/Hits.java

lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/util/Daemon.java

lucene/nutch/branches/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java

lucene/nutch/branches/Release-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java

lucene/nutch/branches/Release-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java

lucene/nutch/branches/Release-0.7/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java

Modified: 
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/clustering/OnlineClusterer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/clustering/OnlineClusterer.java?rev=240345r1=240344r2=240345view=diff
==
--- 
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/clustering/OnlineClusterer.java
 (original)
+++ 
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/clustering/OnlineClusterer.java
 Fri Aug 26 14:03:19 2005
@@ -23,8 +23,8 @@
  * algorithms.
  *
  * pBy the term bonline/b search results clustering we will understand
- * a clusterer that works on a set of {@link Hit}s retrieved for a user's query
- * and produces a set of {@link Clusters} that can be displayed to help
+ * a clusterer that works on a set of {@link HitDetails} retrieved for a user's
+ * query and produces a set of {@link HitsCluster} that can be displayed to help
  * the user gain insight in the topics found in the result./p
  *
  * pOther clustering options include predefined categories and off-line

Modified: 
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/fs/NutchFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=240345r1=240344r2=240345view=diff
==
--- 
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/fs/NutchFileSystem.java
 (original)
+++ 
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/fs/NutchFileSystem.java
 Fri Aug 26 14:03:19 2005
@@ -80,8 +80,8 @@
   return getNamed(NutchConf.get().get(fs.default.name, local));
 }
 
-/** Returns a name for this filesystem, suitable to pass to {@link
- * NutchFileSystem#getNamed(String).*/
+/** Returns a name for this filesystem, suitable to pass to
+ * {@link NutchFileSystem#getNamed(String)}.*/
 public abstract String getName();
   
 /** Returns a named filesystem.  Names are either the string local or a

Modified: 
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/FileSplit.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/FileSplit.java?rev=240345r1=240344r2=240345view=diff
==
--- 
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/FileSplit.java
 (original)
+++ 
lucene/nutch/branches/Release-0.7/src/java/org/apache/nutch/mapReduce/FileSplit.java
 Fri Aug 26 14:03:19 2005
@@ -25,9 +25,12 @@
 import org.apache.nutch.io.UTF8;
 import org.apache.nutch.fs.NutchFileSystem;
 
-/** A section of an input file.  Returned by {@link
- * InputFormat

svn commit: r240359 - in /lucene/nutch/trunk/src: java/org/apache/nutch/analysis/ java/org/apache/nutch/indexer/ plugin/nutch-extensionpoints/

2005-08-26 Thread jerome
Author: jerome
Date: Fri Aug 26 15:47:04 2005
New Revision: 240359

URL: http://svn.apache.org/viewcvs?rev=240359view=rev
Log:
Add an analysis extension point

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java  
 (with props)
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java   
(with props)
Modified:

lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml

Added: 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=240359view=auto
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java 
Fri Aug 26 15:47:04 2005
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the License);
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.analysis;
+
+// JDK imports
+import java.util.HashMap;
+import java.util.Map;
+import java.util.logging.Logger;
+
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.util.LogFormatter;
+
+
+/**
+ * Creates and caches {@link NutchAnalyzer} plugins.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class AnalyzerFactory {
+
+  public final static Logger LOG =
+  LogFormatter.getLogger(AnalyzerFactory.class.getName());
+
+  private final static ExtensionPoint X_POINT = 
+  PluginRepository.getInstance()
+  .getExtensionPoint(NutchAnalyzer.X_POINT_ID);
+
+  private final static Map CACHE = new HashMap();
+
+  private final static NutchAnalyzer DEFAULT_ANALYSER = 
+new NutchDocumentAnalyzer();
+  
+  
+  static {
+if (X_POINT == null) {
+  throw new RuntimeException(x point  + NutchAnalyzer.X_POINT_ID +
+  not found.);
+}
+  }
+
+
+  private AnalyzerFactory() {}
+
+  
+  /**
+   * Returns the appropriate {@link Analyser} implementation given a language
+   * code.
+   *
+   * <p>NutchAnalyser extensions should define the attribute lang. The first
+   * plugin found whose lang attribute equals the specified lang parameter is
+   * used. If none match, then the {@link NutchDocumentAnalyzer} is used.
+   */
+  public static NutchAnalyzer get(String lang) {
+
+NutchAnalyzer analyzer = DEFAULT_ANALYSER;
+Extension extension = getExtension(lang);
+if (extension != null) {
+try {
+analyzer = (NutchAnalyzer) extension.getExtensionInstance();
+} catch (PluginRuntimeException pre) {
+analyzer = DEFAULT_ANALYSER;
+}
+}
+return analyzer;
+  }
+
+  private static Extension getExtension(String lang) {
+
+Extension extension = (Extension) CACHE.get(lang);
+if (extension == null) {
+  extension = findExtension(lang);
+  CACHE.put(lang, extension);
+}
+return extension;
+  }
+
+  private static Extension findExtension(String lang) {
+
+if (lang != null) {
+  Extension[] extensions = X_POINT.getExtentens();
+  for (int i=0; iextensions.length; i++) {
+if (lang.equals(extensions[i].getAttribute(lang))) {
+  return extensions[i];
+}
+  }
+}
+return null;
+  }
+
+}

Propchange: 
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
--
svn:eol-style = native

Added: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java?rev=240359view=auto
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java 
Fri Aug 26 15:47:04 2005

svn commit: r233492 - in /lucene/nutch/trunk: conf/ src/plugin/ src/plugin/clustering-carrot2/ src/plugin/creativecommons/ src/plugin/index-basic/ src/plugin/index-more/ src/plugin/languageidentifier/ src/plugin/nutch-extensionpoints/ src/plugin/nutch-...

2005-08-19 Thread jerome
Author: jerome
Date: Fri Aug 19 08:55:46 2005
New Revision: 233492

URL: http://svn.apache.org/viewcvs?rev=233492view=rev
Log:
NUTCH-10, extension points defined only once (Stefan Groschupf)

Added:
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/build.xml   (with props)
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml   (with 
props)
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/src/
lucene/nutch/trunk/src/plugin/nutch-extensionpoints/src/java/
Modified:
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
lucene/nutch/trunk/src/plugin/creativecommons/plugin.xml
lucene/nutch/trunk/src/plugin/index-basic/plugin.xml
lucene/nutch/trunk/src/plugin/index-more/plugin.xml
lucene/nutch/trunk/src/plugin/languageidentifier/plugin.xml
lucene/nutch/trunk/src/plugin/ontology/plugin.xml
lucene/nutch/trunk/src/plugin/parse-ext/plugin.xml
lucene/nutch/trunk/src/plugin/parse-html/plugin.xml
lucene/nutch/trunk/src/plugin/parse-js/plugin.xml
lucene/nutch/trunk/src/plugin/parse-mp3/plugin.xml
lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml
lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
lucene/nutch/trunk/src/plugin/parse-rtf/plugin.xml
lucene/nutch/trunk/src/plugin/parse-text/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-file/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-ftp/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-http/plugin.xml
lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
lucene/nutch/trunk/src/plugin/query-basic/plugin.xml
lucene/nutch/trunk/src/plugin/query-more/plugin.xml
lucene/nutch/trunk/src/plugin/query-site/plugin.xml
lucene/nutch/trunk/src/plugin/query-url/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml
lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=233492r1=233491r2=233492view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Fri Aug 19 08:55:46 2005
@@ -587,9 +587,10 @@
 
 property
   nameplugin.includes/name
-  
valueprotocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value
+  
valuenutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)/value
   descriptionRegular expression naming plugin directory names to
-  include.  Any plugin not matching this expression is excluded.  By
+  include.  Any plugin not matching this expression is excluded.
+  In any case you need to include at least the nutch-extensionpoints plugin. By
   default Nutch includes crawling just HTML and plain text via HTTP,
   and basic indexing and search plugins.
   /description

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=233492r1=233491r2=233492view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Aug 19 08:55:46 2005
@@ -6,6 +6,7 @@
   !-- Build  deploy all the plugin jars.--
   !-- == --
   target name=deploy
+ ant dir=nutch-extensionpoints target=deploy/
  ant dir=protocol-file target=deploy/
  ant dir=protocol-ftp target=deploy/
  ant dir=protocol-http target=deploy/
@@ -54,6 +55,7 @@
   !-- Clean all of the plugins.  --
   !-- == --
   target name=clean
+ant dir=nutch-extensionpoints target=clean/
 ant dir=protocol-file target=clean/
 ant dir=protocol-ftp target=clean/
 ant dir=protocol-http target=clean/

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=233492r1=233491r2=233492view=diff
==
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Fri Aug 19 
08:55:46 2005
@@ -5,10 +5,6 @@
version=0.9.0

provider-name=carrot2.sourceforge.net

 

-   extension-point

-  id=org.apache.nutch.clustering.OnlineClusterer

-  name=Nutch Online Search Results Clustering Plugin/

-

runtime

   library name=clustering-carrot2.jar

  export name=*/


Modified: lucene/nutch

svn commit: r233544 - /lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java

2005-08-19 Thread jerome
Author: jerome
Date: Fri Aug 19 12:26:14 2005
New Revision: 233544

URL: http://svn.apache.org/viewcvs?rev=233544view=rev
Log:
Correction in LanguageIdentifier unit test

Modified:

lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java?rev=233544r1=233543r2=233544view=diff
==
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
 Fri Aug 19 12:26:14 2005
@@ -223,7 +223,7 @@
 String testLine = null;
 while((testLine = testFile.readLine()) != null) {
 testLine = testLine.trim();
-if (testLine.length()  64) {
+if (testLine.length()  256) {
 lang = idfr.identify(testLine);
 assertEquals(tokens[1], lang);
 }




svn commit: r233559 - in /lucene/nutch/trunk/src: java/org/apache/nutch/parse/ plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/ plugi...

2005-08-19 Thread jerome
Author: jerome
Date: Fri Aug 19 14:15:02 2005
New Revision: 233559

URL: http://svn.apache.org/viewcvs?rev=233559view=rev
Log:
* Add utility to extract urls from plain text (Stephan Strittmatter)
* Uses the OutlinkExtractor in parse plugins PDF, MSWord, Text, RTF, Ext

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java   
(with props)

lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java   
(with props)
Modified:

lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java

lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java

lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java

lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java

Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=233559view=auto
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
Fri Aug 19 14:15:02 2005
@@ -0,0 +1,227 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the License);
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.apache.nutch.util.LogFormatter;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+
+/**
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s 
+ * / URLs from plain text using Regular Expressions.
+ * 
+ * @see <a
+ *      href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
+ *      of different regexp-Implementations </a>
+ * @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs
+ *      </a>
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @version 1.0
+ * @since 0.7
+ */
+public class OutlinkExtractor {
+  private static final Logger LOG = LogFormatter
+  .getLogger(OutlinkExtractor.class.getName());
+
+  /**
+   * Regex pattern to get URLs within a plain text.
+   * 
+   * @see a
+   *  
href=http://www.truerwords.net/articles/ut/urlactivation.html;http://www.truerwords.net/articles/ut/urlactivation.html
+   *  /a
+   */
+  private static final String URL_PATTERN = 
+
([A-Za-z][A-Za-z0-9+.-]+:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@~=%-]*))?);
+
+  /**
+   * Extracts <code>Outlink</code> from given plain text.
+   * 
+   * @param plainText  the plain text from which URLs should be extracted.
+   * 
+   * @return Array of <code>Outlink</code>s within found in plainText
+   */
+  public static Outlink[] getOutlinks(final String plainText) {
+return OutlinkExtractor.getOutlinks(plainText, );
+  }
+
+  /**
+   * Extracts <code>Outlink</code> from given plain text and adds anchor
+   * to the extracted <code>Outlink</code>s
+   * 
+   * @param plainText the plain text from which URLs should be extracted.
+   * @param anchor    the anchor of the url
+   * 
+   * @return Array of <code>Outlink</code>s within found in plainText
+   */
+  public static Outlink[] getOutlinks(final String plainText, String anchor) {
+
+final List outlinks = new ArrayList();
+
+try {
+  final PatternCompiler cp = new Perl5Compiler();
+  final Pattern pattern = cp.compile(URL_PATTERN,
+  Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+  | Perl5Compiler.MULTILINE_MASK);
+  final PatternMatcher matcher = new Perl5Matcher();
+
+  final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+  MatchResult result;
+  String url;
+
+  //loop the matches
+  while (matcher.contains(input, pattern)) {
+result = matcher.getMatch

svn commit: r233312 - in /lucene/nutch/trunk: docs/ca/ docs/de/ docs/en/ docs/es/ docs/fi/ docs/fr/ docs/hu/ docs/jp/ docs/ms/ docs/nl/ docs/pl/ docs/pt/ docs/sv/ docs/th/ docs/zh/ src/web/jsp/ src/web/style/

2005-08-18 Thread jerome
Author: jerome
Date: Thu Aug 18 05:17:22 2005
New Revision: 233312

URL: http://svn.apache.org/viewcvs?rev=233312view=rev
Log:
Gives focus to search query input

Modified:
lucene/nutch/trunk/docs/ca/about.html
lucene/nutch/trunk/docs/ca/help.html
lucene/nutch/trunk/docs/ca/search.html
lucene/nutch/trunk/docs/de/about.html
lucene/nutch/trunk/docs/de/help.html
lucene/nutch/trunk/docs/de/search.html
lucene/nutch/trunk/docs/en/about.html
lucene/nutch/trunk/docs/en/help.html
lucene/nutch/trunk/docs/en/search.html
lucene/nutch/trunk/docs/es/about.html
lucene/nutch/trunk/docs/es/help.html
lucene/nutch/trunk/docs/es/search.html
lucene/nutch/trunk/docs/fi/about.html
lucene/nutch/trunk/docs/fi/help.html
lucene/nutch/trunk/docs/fi/search.html
lucene/nutch/trunk/docs/fr/about.html
lucene/nutch/trunk/docs/fr/search.html
lucene/nutch/trunk/docs/hu/about.html
lucene/nutch/trunk/docs/hu/help.html
lucene/nutch/trunk/docs/hu/search.html
lucene/nutch/trunk/docs/jp/about.html
lucene/nutch/trunk/docs/jp/help.html
lucene/nutch/trunk/docs/jp/search.html
lucene/nutch/trunk/docs/ms/about.html
lucene/nutch/trunk/docs/ms/help.html
lucene/nutch/trunk/docs/ms/search.html
lucene/nutch/trunk/docs/nl/about.html
lucene/nutch/trunk/docs/nl/help.html
lucene/nutch/trunk/docs/nl/search.html
lucene/nutch/trunk/docs/pl/about.html
lucene/nutch/trunk/docs/pl/help.html
lucene/nutch/trunk/docs/pl/search.html
lucene/nutch/trunk/docs/pt/about.html
lucene/nutch/trunk/docs/pt/help.html
lucene/nutch/trunk/docs/pt/search.html
lucene/nutch/trunk/docs/sv/about.html
lucene/nutch/trunk/docs/sv/help.html
lucene/nutch/trunk/docs/sv/search.html
lucene/nutch/trunk/docs/th/about.html
lucene/nutch/trunk/docs/th/help.html
lucene/nutch/trunk/docs/th/search.html
lucene/nutch/trunk/docs/zh/about.html
lucene/nutch/trunk/docs/zh/help.html
lucene/nutch/trunk/docs/zh/search.html
lucene/nutch/trunk/src/web/jsp/search.jsp
lucene/nutch/trunk/src/web/style/nutch-page.xsl

Modified: lucene/nutch/trunk/docs/ca/about.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/about.html?rev=233312r1=233311r2=233312view=diff
==
--- lucene/nutch/trunk/docs/ca/about.html (original)
+++ lucene/nutch/trunk/docs/ca/about.html Thu Aug 18 05:17:22 2005
@@ -19,8 +19,12 @@
 /style
 link type=image/x-icon href=../img/favicon.ico rel=icon
 link type=image/x-icon href=../img/favicon.ico rel=shortcut icon
+script type=text/javascript
+!--
+function queryfocus() { document.search.query.focus(); }
+// --/script
 /head
-body
+body onLoad=queryfocus();
 !--This file is automatically generated.  Do not edit!--
 table cellspacing=0 cellpadding=0 border=0 width=635
 tr

Modified: lucene/nutch/trunk/docs/ca/help.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/help.html?rev=233312r1=233311r2=233312view=diff
==
--- lucene/nutch/trunk/docs/ca/help.html (original)
+++ lucene/nutch/trunk/docs/ca/help.html Thu Aug 18 05:17:22 2005
@@ -19,8 +19,12 @@
 /style
 link type=image/x-icon href=../img/favicon.ico rel=icon
 link type=image/x-icon href=../img/favicon.ico rel=shortcut icon
+script type=text/javascript
+!--
+function queryfocus() { document.search.query.focus(); }
+// --/script
 /head
-body
+body onLoad=queryfocus();
 !--This file is automatically generated.  Do not edit!--
 table cellspacing=0 cellpadding=0 border=0 width=635
 tr

Modified: lucene/nutch/trunk/docs/ca/search.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/ca/search.html?rev=233312r1=233311r2=233312view=diff
==
--- lucene/nutch/trunk/docs/ca/search.html (original)
+++ lucene/nutch/trunk/docs/ca/search.html Thu Aug 18 05:17:22 2005
@@ -19,8 +19,12 @@
 /style
 link type=image/x-icon href=../img/favicon.ico rel=icon
 link type=image/x-icon href=../img/favicon.ico rel=shortcut icon
+script type=text/javascript
+!--
+function queryfocus() { document.search.query.focus(); }
+// --/script
 /head
-body
+body onLoad=queryfocus();
 !--This file is automatically generated.  Do not edit!--
 table cellspacing=0 cellpadding=0 border=0 width=635
 tr

Modified: lucene/nutch/trunk/docs/de/about.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/docs/de/about.html?rev=233312r1=233311r2=233312view=diff
==
--- lucene/nutch/trunk/docs/de/about.html (original)
+++ lucene/nutch/trunk/docs/de/about.html Thu Aug 18 05:17:22 2005
@@ -19,8 +19,12 @@
 /style
 link type=image/x-icon href=../img/favicon.ico rel=icon
 link type=image/x-icon href=../img/favicon.ico rel=shortcut icon
+script type=text/javascript
+!--
+function

svn commit: r233140 - in /lucene/nutch/trunk: site/credits.html site/credits.pdf src/site/src/documentation/content/xdocs/credits.xml

2005-08-17 Thread jerome
Author: jerome
Date: Wed Aug 17 02:39:29 2005
New Revision: 233140

URL: http://svn.apache.org/viewcvs?rev=233140view=rev
Log:
First commit test - Add myself to the list of committers

Modified:
lucene/nutch/trunk/site/credits.html
lucene/nutch/trunk/site/credits.pdf
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/credits.xml

Modified: lucene/nutch/trunk/site/credits.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/credits.html?rev=233140r1=233139r2=233140view=diff
==
--- lucene/nutch/trunk/site/credits.html (original)
+++ lucene/nutch/trunk/site/credits.html Wed Aug 17 02:39:29 2005
@@ -201,6 +201,10 @@
 liMike Cafarella/li
   
 li
+a href=http://www.frutch.org/;Jeacute;rocirc;me Charron/a
+/li
+  
+li
 a href=http://www.nutch.org/blog/cutting.html;Doug Cutting/a
 /li
   
@@ -214,7 +218,7 @@
 /div
 
 
-a name=N1002A/aa name=Friends/a
+a name=N1002F/aa name=Friends/a
 h2 class=h3Friends/h2
 div class=section
 ul
@@ -245,7 +249,7 @@
 /div
 
 
-a name=N10074/aa name=Sponsors/a
+a name=N10079/aa name=Sponsors/a
 h2 class=h3Sponsors/h2
 div class=section
 ul

Modified: lucene/nutch/trunk/site/credits.pdf
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/credits.pdf?rev=233140r1=233139r2=233140view=diff
==
--- lucene/nutch/trunk/site/credits.pdf (original)
+++ lucene/nutch/trunk/site/credits.pdf Wed Aug 17 02:39:29 2005
@@ -58,10 +58,10 @@
 
 endobj
 14 0 obj
- /Length 2181 /Filter [ /ASCII85Decode /FlateDecode ]
+ /Length 2288 /Filter [ /ASCII85Decode /FlateDecode ]
  
 stream
-Gatm=968iGAII3Ya:;i#YfhoOM/V)S]5Y)Y3kI`l5jUFP$%=LP*?^Uf69OMYp)[EMAIL 
PROTECTED]:^b0dHd)GtMr)-^OfoC3iS2o/87PfD^,B+9JG'FY-@([EMAIL 
PROTECTED];oqY/ejPUi@@iL!tqemI#,'u1B=:]n$76+QB%KCD,TDs$ZJI,]C9!L(O]IBprnI_Kj(L[lSq,;OPn;\3M%tbZ`VMH;JG1E/iiU5n(FG]q:#/T:)*Zh0uYAL6)[EMAIL
 PROTECTED]*0K#D`cBALg]R9eV:[EMAIL 
PROTECTED](b]W;@EU`1)n,3k_FPKLr9\`u^A;6^@S?7]YoAfZBDRZs9D)#DPBsp'PN7-8Pg`Y;9,*!`$*qb^%!Hq(OBEPH$E,3ecs0]1_8%dh;u[EPUV8PA%ct[EMAIL
 PROTECTED],[EMAIL 
PROTECTED])(91nb1f'UPsh.:QgcmnI124ob902[%qe8M!b$#YO+14odl.qqQ?$D+8jAI-:\k-qMW*_)b;?([EMAIL
 PROTECTED]j5Y60#i:7HO=qseDCq/lejK^O?'6b[EMAIL 
PROTECTED],hKY-UP1-c\#u6Yq2o7L)i(j4MeiQg#(ORZUYuH!pC479up((XAY-EXo8!iPkqM0]H+TsM[HjHH0[a#r+8K7#?CBELoslU#jHR62t,0hui;gC*n[KEUl/fgkT+'mTOTPsscQlriB6t3@'D[r?+q,`c\Q;([EMAIL
 PROTECTED]'qDc$WTk--ji/Z8A2W\!uKD,RlnelZ`CXD/[EMAIL 
PROTECTED],th6Q*'p!jjQtVsQcl$ikl8:r.l0)YZ+d88_bH6u#81#?c,iENVKYlptAqnegF[kUl5K/[J+,[EMAIL
 PROTECTED],9;[EMAIL PROTECTED]@[TMK;H!Wsn!9aL#Hk!,L/?[EMAIL 
PROTECTED]@3;R*)G3V-*Z_,3-.;bSLt%EB:lG;Lc/:L;l'1OPhGP/%9.nMl,t=5;9NWH[hdYTpGJOq-MEgFk+0D^/in?c%%dUmtRa=k9aAh?5(W`o,W^ipu.*4O*HQ[b8%3*U3-2hlF1MnqS.B1NJR:C/^pQ`;([EMAIL
 
PROTECTED]95;VWt[WZ$b#jF\160K4MtI\j?*4CITY]]g4r15)-,B#bmG3K!nTLf*/!=,;81bS'[EMAIL
 PROTECTED]:D=8\E?lce%+4!tcIJSCFDI8d]c7=k*\DQBQHh.jDK8#BWISlb^VJR$`V7/[EMAIL 
PROTECTED]C\PWgo+/[EMAIL PROTECTED]@q[EMAIL 
PROTECTED][1NeZ:.W5:6j3IIMiu:SMRB3,Ka(FR:m$?lWIbsJfeP5D:PMCi*/#'pg_9F#m*:p9Q[EMAIL
 PROTECTED]/671fcgaq,87rJMqb:@:MKLCUjU[b+A'qdIh*'--Hn4Rr]ibVQ):b;[EMAIL 
PROTECTED],R\EkK'#CAtfA6K)YaVKHo:#a4g;L]SuD5unu\9#_9t3)CXUeoY^Y#G1*`!FCAd]SegE8@RkGH;WnQj5pgnPP?5$S*YCZQ;__eY4n?1n!t1mqK6fUENR-V'o6i5I0e5SJ1]-7Q9grD.AIq*dM#VJnmDkS?C]Fs3\G60KbEFXk?#,CVZK;.V`q;d4U1dNg6j!rk,^m,Z=?pTDEXJ-/q?DNk!:7/[EMAIL
 PROTECTED]`-Gb+fF'bACIkT-!_g!'[EMAIL 
PROTECTED]q^VcnirDE,LJDUA=UGeg!0,4VEu;t8a/KU/q9+d)cmY:rb(Zpi3^L,uQgF5p%U7FNVi:tBeTE*e(rc)rHhQA\P'9,[EMAIL
 
PROTECTED]M#as'kM$%2-W^a7JO?,qXk*]D+9bFA5CGdW[QT3EZggVRtUORC.PdRS[YTNJu/rc)Y9+-\sm*(=uM8W3`Q17*gM]ETCVCV_UKBn,/92_S=D=r/dA5F(lZK(HJWsnb!j8B2I*W!KCQURGhbfF?7m3:1pN12.IlPqjNc.3jJ_EJ/[EMAIL
 PROTECTED]
+Gatm=Bci#:XAWTJ(Yt%ne1![EMAIL 
PROTECTED]7[C1qt/sg.%9W%d#;R+io5E1IcX_^[[EMAIL 
PROTECTED]'=a[K^7:?_;ArHUm\(3X\ebIg/Q6]=)aF*U1qTHIbrM)PVIB0A(q0h'#S2l61S8mR;rUj_Y,;`cVi1I[q00`'V[K[5=M?KPhQ+6X%jJf!!Jb7'@L]_d(3$36'[EMAIL
 
PROTECTED])BL_ED[Ih9CD:?/4m?Y#+\SePa_G^a?ClN`3or4rt_,5WYPF.cL:dc+M[fCg-%fBX[UXn'TH8!lYSnDa`'o*q)4Up+=IQ1OJ`Uf/RkI]Qq?KU!O^:(@qFoLlRe2ojIeR\-al\P,F*;C]4HJh3[!4?1I^kZalD*5!;WQ[WGo+83!I,+PA8'5VD2n??bZF8lpf'[EMAIL
 PROTECTED];s+LNF]5N,bEpA%lWE%4'*;U9#aSi'm[-.5A-H6bHp#:4CN1M*1*Oe6([EMAIL 
PROTECTED]O'aUO99ZN_3.''dc4PBBuq`qn6t_:[EMAIL 
PROTECTED])I[*DY?WB8:8lF:3(Q'Eiu(0eeh@(3ncMFsO/?m#7-jd.A!.o/\F!+)7QU'eBEoFi,j15-SRk%Xn[;O6YU7cL%:X$o0FC[EMAIL
 PROTECTED])Hi$q79(9)[EMAIL PROTECTED]oDaOf/AeP;+Q:UM3n8WLlFiYi9Jk:[EMAIL 
PROTECTED]nnQt)*':[EMAIL 
PROTECTED])UA7`+q;!D$$(MGtk\^m]L(L`9iOofINi5[7i?'^H!G*TlTQ\8;LYn]RK`Y;#C9lhP[4PdoFkJHKGO)OMJ\d]`HcrHSXjlXoRTL)#?RJWP7NZYS67KVK-2E`NM.goZh\C0goCL!bgW\:(kN.pR%DA/s+IG:CG4]f0a;[EMAIL
 PROTECTED]:A\8cbcoQ99[KFo4!!j(LLt`GKgh0CgeI;Wp\!9XAB!XF*/Y=2?X/0+T)[EMAIL 
PROTECTED]:h2m(CJZj30/Q2]IVn5PT1/.,WIDUY3r`[(+3=D?He3eDc2J=AB0%[9=V%je_q]s;V-,Ei2ZTHI+(;G`V#^809@;6``JRCf^rg(7)O4upqBO\A-7ftiRkN`:PIB0B_n#6Z$U6t=\HKlhO/nERHl\D[kHnDV,5=`h:O\KR:+hn.pR;MU4Hp5dZH