svn commit: r382912 - in /lucene/nutch/trunk/src/java/org/apache/nutch: crawl/ fetcher/ indexer/ parse/ plugin/ searcher/ segment/

2006-03-03 Thread cutting
Author: cutting
Date: Fri Mar  3 11:05:41 2006
New Revision: 382912

URL: http://svn.apache.org/viewcvs?rev=382912&view=rev
Log:
Undo unintentional changes made in r381751.  Thanks, Jerome, for catching this!

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=382912&r1=382911&r2=382912&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Mar  3 
11:05:41 2006
@@ -44,11 +44,11 @@
 
 
   /* Perform complete crawling and indexing given a set of root urls. */
-  public static boolean doMain(String args[]) throws Exception {
+  public static void main(String args[]) throws Exception {
 if (args.length  1) {
   System.out.println
 (Usage: Crawl urlDir [-dir d] [-threads n] [-depth i] [-topN N]);
-  return false;
+  return;
 }
 
 Configuration conf = NutchConfiguration.create();
@@ -122,22 +122,5 @@
 new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, job).merge();
 
 LOG.info(crawl finished:  + dir);
-
-return true;
-  }
-
-  /**
-   * main() wrapper that returns proper exit status
-   */
-  public static void main(String[] args) {
-Runtime rt = Runtime.getRuntime();
-try {
-  boolean status = doMain(args);
-  rt.exit(status ? 0 : 1);
-}
-catch (Exception e) {
-  LOG.log(Level.SEVERE, error, caught Exception in main(), e);
-  rt.exit(1);
-}
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=382912&r1=382911&r2=382912&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Fri Mar  3 
11:05:41 2006
@@ -90,31 +90,17 @@
 fs.delete(old);
   }
 
-  public static boolean doMain(String[] args) throws Exception {
+  public static void main(String[] args) throws Exception {
 CrawlDb crawlDb = new CrawlDb(NutchConfiguration.create());
 
 if (args.length  2) {
   System.err.println(Usage: crawldb segment);
-  return false;
+  return;
 }
 
 crawlDb.update(new File(args[0]), new File(args[1]));
-
-return true;
   }
 
-  /**
-   * main() wrapper that returns proper exit status
-   */
-  public static void main(String[] args) {
-Runtime rt = Runtime.getRuntime();
-try {
-  boolean status = doMain(args);
-  rt.exit(status ? 0 : 1);
-}
-catch (Exception e) {
-  LOG.log(Level.SEVERE, error, caught Exception in main(), e);
-  rt.exit(1);
-}
-  }
+
+
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=382912&r1=382911&r2=382912&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Fri 
Mar  3 11:05:41 2006
@@ -20,7 +20,7 @@
 import java.io.IOException;
 import java.util.Iterator;
 import java.util.TreeMap;
-import java.util.logging.*;
+import java.util.logging.Logger;
 
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.io.LongWritable;
@@ -241,7 +241,7 @@
 JobClient.runJob(job);
   }
 
-  public static boolean doMain(String[] args) throws IOException {
+  public static void main(String[] args) throws IOException {
 CrawlDbReader dbr = new CrawlDbReader();
 
 if (args.length  1) {
@@ -250,7 +250,7 

[Nutch Wiki] Update of PublicServers by GordonMohr

2006-03-03 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Nutch Wiki for change 
notification.

The following page has been changed by GordonMohr:
http://wiki.apache.org/nutch/PublicServers

--
  
* [http://www.igougo.com/  IgoUgo] - Travel Search Engine based on Nutch.
  
+   * [http://www.jboss.com/search.jsp?query=http&x=0&y=0 jboss homepage] The 
jboss (tm) homepage runs a nutch as homepage search engine.
+ 
* [http://www.jcintersonic.com/ JC Intersonic] uses nutch as its search 
engine.
  
* [http://www.millionpixelsearchpage.com The Million Pixel Search Page] - 
Search engine for Alex Tew's [http://www.milliondollarhomepage.com Million 
Dollar Homepage].
@@ -40, +42 @@

  
* [http://labs.yahoo.com/demo/nutch/ Yahoo Labs] presents their Nutch 
testing platform. (offline as of ~7 December 2004)
  
-   * [http://www.jboss.com/search.jsp?query=http&x=0&y=0 jboss homepage] The 
jboss (tm) homepage runs a nutch as homepage search engine.
+   * [http://webharvest.gov Webharvest.gov] offers full-text search of nearly 
100 million resources collected from US Federal Government websites as part of 
the National Archive and Records Administration's 2004 Presidential Term Web 
Harvest
  


svn commit: r382939 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-03-03 Thread cutting
Author: cutting
Date: Fri Mar  3 13:46:21 2006
New Revision: 382939

URL: http://svn.apache.org/viewcvs?rev=382939&view=rev
Log:
Upgrade hadoop to latest version with some important mapred bug fixes.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=382939&r1=382938&r2=382939&view=diff
==
Binary files - no diff available.




[Nutch Wiki] Update of PluginCentral by JeromeCharron

2006-03-03 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Nutch Wiki for change 
notification.

The following page has been changed by JeromeCharron:
http://wiki.apache.org/nutch/PluginCentral

--
  
  In order to get Nutch to use any of these plugins, you just need to edit your 
conf/nutch-site.xml file and add the name of the plugin to the list of 
plugin.includes.
  
-  * index-basic - Adds url, content and anchor fields to the index.
+  * '''index-basic''' - Adds url, content and anchor fields to the index.
-  * index-more - Adds date, content-length, contentType, primaryType and 
subtype fields to the index.
+  * '''index-more''' - Adds date, content-length, contentType, primaryType and 
subtype fields to the index.
-  * languageidentifier - Adds a lang field to the index and allows you to 
query against it.
+  * '''languageidentifier''' - Adds a lang field to the index and allows you 
to query against it.
+  * '''microformats-reltag''' - Adds [http://www.microformats.org/wiki/Rel-Tag 
rel-tag] fields to the index and runs queries against them.
-  * [wiki:OntologyPlugin ontology] - Helps refine queries based on owl files.
+  * '''[wiki:OntologyPlugin ontology]''' - Helps refine queries based on owl 
files.
-  * parse-ext - A wrapper that invokes external command to do real parsing job.
+  * '''parse-ext''' - A wrapper that invokes external command to do real 
parsing job.
-  * parse-html - Parses HTML documents
+  * '''parse-html''' - Parses HTML documents
-  * parse-js - Parses Java``Script
+  * '''parse-js''' - Parses Java``Script
-  * parse-mp3 - Parses MP3s
+  * '''parse-mp3''' - Parses MP3s
-  * parse-msexcel - Parses MS Excel documents
+  * '''parse-msexcel''' - Parses MS Excel documents
-  * parse-mspowerpoint - Parses MS Powerpoint documents
+  * '''parse-mspowerpoint''' - Parses MS Powerpoint documents
-  * parse-msword - Parses MS Word documents
+  * '''parse-msword''' - Parses MS Word documents
-  * parse-pdf - Parses PDFs
+  * '''parse-pdf''' - Parses PDFs
-  * parse-rss - Parses RSS feeds
+  * '''parse-rss''' - Parses RSS feeds
-  * parse-rtf - Parses RTF files
+  * '''parse-rtf''' - Parses RTF files
-  * parse-swf - Parses Flash SWF files
+  * '''parse-swf''' - Parses Flash SWF files
-  * parse-text - Parses text documents
+  * '''parse-text''' - Parses text documents
-  * protocol-file - Retrieves documents from the filesystem
+  * '''protocol-file''' - Retrieves documents from the filesystem
-  * protocol-ftp - Retrieves documents through ftp
+  * '''protocol-ftp''' - Retrieves documents through ftp
-  * protocol-http - Retrieves documents through http
+  * '''protocol-http''' - Retrieves documents through http
-  * protocol-httpclient - Retrieves documents through http and https
+  * '''protocol-httpclient''' - Retrieves documents through http and https
-  * query-basic - Runs queries against content, url and anchor fields
+  * '''query-basic''' - Runs queries against content, url and anchor fields
-  * query-more - Runs queries against date, content-length, contentType, 
primaryType and subType fields.
+  * '''query-more''' - Runs queries against date, content-length, contentType, 
primaryType and subType fields.
-  * query-site - Runs queries against site field
+  * '''query-site''' - Runs queries against site field
-  * query-url - Runs queries against url field.
+  * '''query-url''' - Runs queries against url field.
-  * urlfilter-prefix
+  * '''urlfilter-prefix'''
-  * urlfilter-regex
+  * '''urlfilter-regex'''
  
  == Plugins You can Download ==
  


[Nutch Wiki] Update of bin/nutch crawl by fha

2006-03-03 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on Nutch Wiki for change 
notification.

The following page has been changed by fha:
http://wiki.apache.org/nutch/bin/nutch_crawl

--
  
  [-showThreadID]
  
- [-depth i]
+ [-depth i]: You can tell Nutch how deep it should crawl. If you don't give 
Nutch a value, it uses 3 as its default. 
+ For example, if you say -depth 1, Nutch will only index the first level. 
Only if you say -depth 2 (or more) will Nutch follow links.
+  
  
- [-dir d] 
+ [-dir d]: You can choose the directory where Nutch should save the index.
+ If you don't choose a directory, Nutch will create its own directory in the 
directory where you started the crawl.
+ Example of a -dir parameter: -dir /usr/local/index/ 
  
- [-threads n]
+ [-threads n]: You can choose how many threads Nutch will use.
  
  -local
  


svn commit: r382981 - in /lucene/nutch/trunk/src/plugin: ./ analysis-de/ analysis-fr/ clustering-carrot2/ creativecommons/ index-basic/ index-more/ languageidentifier/ lib-http/ lib-parsems/ microform

2006-03-03 Thread jerome
Author: jerome
Date: Fri Mar  3 16:26:54 2006
New Revision: 382981

URL: http://svn.apache.org/viewcvs?rev=382981&view=rev
Log:
Plugins now assume that the core is already built when building Nutch

Modified:
lucene/nutch/trunk/src/plugin/analysis-de/build.xml
lucene/nutch/trunk/src/plugin/analysis-fr/build.xml
lucene/nutch/trunk/src/plugin/build-plugin.xml
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
lucene/nutch/trunk/src/plugin/creativecommons/build.xml
lucene/nutch/trunk/src/plugin/index-basic/build.xml
lucene/nutch/trunk/src/plugin/index-more/build.xml
lucene/nutch/trunk/src/plugin/languageidentifier/build.xml
lucene/nutch/trunk/src/plugin/lib-http/build.xml
lucene/nutch/trunk/src/plugin/lib-parsems/build.xml
lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml
lucene/nutch/trunk/src/plugin/ontology/build.xml
lucene/nutch/trunk/src/plugin/parse-ext/build.xml
lucene/nutch/trunk/src/plugin/parse-html/build.xml
lucene/nutch/trunk/src/plugin/parse-js/build.xml
lucene/nutch/trunk/src/plugin/parse-mp3/build.xml
lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
lucene/nutch/trunk/src/plugin/parse-msword/build.xml
lucene/nutch/trunk/src/plugin/parse-pdf/build.xml
lucene/nutch/trunk/src/plugin/parse-rss/build.xml
lucene/nutch/trunk/src/plugin/parse-swf/build.xml
lucene/nutch/trunk/src/plugin/parse-text/build.xml
lucene/nutch/trunk/src/plugin/parse-zip/build.xml
lucene/nutch/trunk/src/plugin/protocol-file/build.xml
lucene/nutch/trunk/src/plugin/protocol-ftp/build.xml
lucene/nutch/trunk/src/plugin/protocol-http/build.xml
lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml
lucene/nutch/trunk/src/plugin/query-basic/build.xml
lucene/nutch/trunk/src/plugin/query-more/build.xml
lucene/nutch/trunk/src/plugin/query-site/build.xml
lucene/nutch/trunk/src/plugin/query-url/build.xml
lucene/nutch/trunk/src/plugin/urlfilter-prefix/build.xml
lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml

Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/build.xml?rev=382981&r1=382980&r2=382981&view=diff
==
--- lucene/nutch/trunk/src/plugin/analysis-de/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/analysis-de/build.xml Fri Mar  3 16:26:54 2006
@@ -1,12 +1,10 @@
 ?xml version=1.0?
 
-project name=analysis-de default=jar
+project name=analysis-de default=jar-core
 
   import file=../build-plugin.xml/
-
-  !-- Build compilation dependencies --
+  
   target name=deps-jar
-ant target=compile-core inheritall=false dir=${nutch.root}/
 ant target=jar inheritall=false dir=../lib-lucene-analyzers/
   /target
 

Modified: lucene/nutch/trunk/src/plugin/analysis-fr/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-fr/build.xml?rev=382981&r1=382980&r2=382981&view=diff
==
--- lucene/nutch/trunk/src/plugin/analysis-fr/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/analysis-fr/build.xml Fri Mar  3 16:26:54 2006
@@ -1,12 +1,11 @@
 ?xml version=1.0?
 
-project name=analysis-fr default=jar
+project name=analysis-fr default=jar-core
 
   import file=../build-plugin.xml/
 
   !-- Build compilation dependencies --
   target name=deps-jar
-ant target=compile-core inheritall=false dir=${nutch.root}/
 ant target=jar inheritall=false dir=../lib-lucene-analyzers/
   /target
 

Modified: lucene/nutch/trunk/src/plugin/build-plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build-plugin.xml?rev=382981&r1=382980&r2=382981&view=diff
==
--- lucene/nutch/trunk/src/plugin/build-plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/build-plugin.xml Fri Mar  3 16:26:54 2006
@@ -96,9 +96,14 @@
  source=${javac.version}
  deprecation=${javac.deprecation}
   classpath refid=classpath/
-/javac
+/javac
   /target
 
+  target name=compile-core
+ant target=compile-core inheritall=false dir=${nutch.root}/
+ant target=compile/
+  /target
+  
   !-- == --
   !-- Make plugin .jar   --
   !-- == --
@@ -109,6 +114,13 @@
   jarfile=${build.dir}/${name}.jar
   basedir=${build.classes}
 /
+  /target
+
+  target name=jar-core depends=compile-core
+jar
+jarfile=${build.dir}/${name}.jar
+basedir=${build.classes}
+/
   /target
 
   !--