svn commit: r382912 - in /lucene/nutch/trunk/src/java/org/apache/nutch: crawl/ fetcher/ indexer/ parse/ plugin/ searcher/ segment/
Author: cutting Date: Fri Mar 3 11:05:41 2006 New Revision: 382912 URL: http://svn.apache.org/viewcvs?rev=382912view=rev Log: Undo unintentional changes made in r381751. Thanks, Jerome, for catching this! Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=382912r1=382911r2=382912view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Mar 3 11:05:41 2006 @@ -44,11 +44,11 @@ /* Perform complete crawling and indexing given a set of root urls. 
*/ - public static boolean doMain(String args[]) throws Exception { + public static void main(String args[]) throws Exception { if (args.length 1) { System.out.println (Usage: Crawl urlDir [-dir d] [-threads n] [-depth i] [-topN N]); - return false; + return; } Configuration conf = NutchConfiguration.create(); @@ -122,22 +122,5 @@ new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, job).merge(); LOG.info(crawl finished: + dir); - -return true; - } - - /** - * main() wrapper that returns proper exit status - */ - public static void main(String[] args) { -Runtime rt = Runtime.getRuntime(); -try { - boolean status = doMain(args); - rt.exit(status ? 0 : 1); -} -catch (Exception e) { - LOG.log(Level.SEVERE, error, caught Exception in main(), e); - rt.exit(1); -} } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=382912r1=382911r2=382912view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Fri Mar 3 11:05:41 2006 @@ -90,31 +90,17 @@ fs.delete(old); } - public static boolean doMain(String[] args) throws Exception { + public static void main(String[] args) throws Exception { CrawlDb crawlDb = new CrawlDb(NutchConfiguration.create()); if (args.length 2) { System.err.println(Usage: crawldb segment); - return false; + return; } crawlDb.update(new File(args[0]), new File(args[1])); - -return true; } - /** - * main() wrapper that returns proper exit status - */ - public static void main(String[] args) { -Runtime rt = Runtime.getRuntime(); -try { - boolean status = doMain(args); - rt.exit(status ? 
0 : 1); -} -catch (Exception e) { - LOG.log(Level.SEVERE, error, caught Exception in main(), e); - rt.exit(1); -} - } + + } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=382912r1=382911r2=382912view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Fri Mar 3 11:05:41 2006 @@ -20,7 +20,7 @@ import java.io.IOException; import java.util.Iterator; import java.util.TreeMap; -import java.util.logging.*; +import java.util.logging.Logger; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.io.LongWritable; @@ -241,7 +241,7 @@ JobClient.runJob(job); } - public static boolean doMain(String[] args) throws IOException { + public static void main(String[] args) throws IOException { CrawlDbReader dbr = new CrawlDbReader(); if (args.length 1) { @@ -250,7 +250,7
[Nutch Wiki] Update of PublicServers by GordonMohr
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by GordonMohr: http://wiki.apache.org/nutch/PublicServers -- * [http://www.igougo.com/ IgoUgo] - Travel Search Engine based on Nutch. + * [http://www.jboss.com/search.jsp?query=http&x=0&y=0 jboss homepage] The jboss (tm) homepage runs a nutch as homepage search engine. + * [http://www.jcintersonic.com/ JC Intersonic] uses nutch as its search engine. * [http://www.millionpixelsearchpage.com The Million Pixel Search Page] - Search engine for Alex Tew's [http://www.milliondollarhomepage.com Million Dollar Homepage]. @@ -40, +42 @@ * [http://labs.yahoo.com/demo/nutch/ Yahoo Labs] presents their Nutch testing platform. (offline as of ~7 December 2004) - * [http://www.jboss.com/search.jsp?query=http&x=0&y=0 jboss homepage] The jboss (tm) homepage runs a nutch as homepage search engine. + * [http://webharvest.gov Webharvest.gov] offers full-text search of nearly 100 million resources collected from US Federal Government websites as part of the National Archive and Records Administration's 2004 Presidential Term Web Harvest
svn commit: r382939 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Fri Mar 3 13:46:21 2006 New Revision: 382939 URL: http://svn.apache.org/viewcvs?rev=382939&view=rev Log: Upgrade hadoop to latest version with some important mapred bug fixes. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=382939&r1=382938&r2=382939&view=diff == Binary files - no diff available.
[Nutch Wiki] Update of PluginCentral by JeromeCharron
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by JeromeCharron: http://wiki.apache.org/nutch/PluginCentral -- In order to get Nutch to use any of these plugins, you just need to edit your conf/nutch-site.xml file and add the name of the plugin to the list of plugin.includes. - * index-basic - Adds url, content and anchor fields to the index. + * '''index-basic''' - Adds url, content and anchor fields to the index. - * index-more - Adds date, content-length, contentType, primaryType and subtype fields to the index. + * '''index-more''' - Adds date, content-length, contentType, primaryType and subtype fields to the index. - * languageidentifier - Adds a lang field to the index and allows you to query against it. + * '''languageidentifier''' - Adds a lang field to the index and allows you to query against it. + * '''microformats-reltag''' - Adds [http://www.microformats.org/wiki/Rel-Tag rel-tag] fields to the index and runs queries against them. - * [wiki:OntologyPlugin ontology] - Helps refine queries based on owl files. + * '''[wiki:OntologyPlugin ontology]''' - Helps refine queries based on owl files. - * parse-ext - A wrapper that invokes external command to do real parsing job. + * '''parse-ext''' - A wrapper that invokes external command to do real parsing job. 
- * parse-html - Parses HTML documents + * '''parse-html''' - Parses HTML documents - * parse-js - Parses Java``Script + * '''parse-js''' - Parses Java``Script - * parse-mp3 - Parses MP3s + * '''parse-mp3''' - Parses MP3s - * parse-msexcel - Parses MS Excel documents + * '''parse-msexcel''' - Parses MS Excel documents - * parse-mspowerpoint - Parses MS Powerpoint documents + * '''parse-mspowerpoint''' - Parses MS Powerpoint documents - * parse-msword - Parses MS Word documents + * '''parse-msword''' - Parses MS Word documents - * parse-pdf - Parses PDFs + * '''parse-pdf''' - Parses PDFs - * parse-rss - Parses RSS feeds + * '''parse-rss''' - Parses RSS feeds - * parse-rtf - Parses RTF files + * '''parse-rtf''' - Parses RTF files - * parse-swf - Parses Flash SWF files + * '''parse-swf''' - Parses Flash SWF files - * parse-text - Parses text documents + * '''parse-text''' - Parses text documents - * protocol-file - Retreives documents from the filesystem + * '''protocol-file''' - Retreives documents from the filesystem - * protocol-ftp - Retreives documents through ftp + * '''protocol-ftp''' - Retreives documents through ftp - * protocol-http - Retreives documents through http + * '''protocol-http''' - Retreives documents through http - * protocol-httpclient - Retreives documents through http and https + * '''protocol-httpclient''' - Retreives documents through http and https - * query-basic - Runs queries against content, url and anchor fields + * '''query-basic''' - Runs queries against content, url and anchor fields - * query-more - Runs queries against date, content-length, contentType, primaryType and subType fields. + * '''query-more''' - Runs queries against date, content-length, contentType, primaryType and subType fields. - * query-site - Runs queries against site field + * '''query-site''' - Runs queries against site field - * query-url - Runs queries against url field. + * '''query-url''' - Runs queries against url field. 
- * urlfilter-prefix + * '''urlfilter-prefix''' - * urlfilter-regex + * '''urlfilter-regex''' == Plugins You can Download ==
[Nutch Wiki] Update of bin/nutch crawl by fha
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by fha: http://wiki.apache.org/nutch/bin/nutch_crawl -- [-showThreadID] - [-depth i] + [-depth i]: You can tell Nutch how deep it should crawl. If you don't tell Nutch a value, it takes 3 as his standard parameter. + For example if you say -depth 1, Nutch would only index the first level. Only if you say -depth 2 (or more) Nutch would make a link follow. + - [-dir d] + [-dir d]: You can choose the directory, where Nutch should save the index. + If you don't choose a directory Nutch would create a own directory in the directory where you started the crawl. + Example of a -dir parameter: -dir /usr/local/index/ - [-threads n] + [-threads n]: You can choose, how many threads Nutch would use. -local
svn commit: r382981 - in /lucene/nutch/trunk/src/plugin: ./ analysis-de/ analysis-fr/ clustering-carrot2/ creativecommons/ index-basic/ index-more/ languageidentifier/ lib-http/ lib-parsems/ microform
Author: jerome Date: Fri Mar 3 16:26:54 2006 New Revision: 382981 URL: http://svn.apache.org/viewcvs?rev=382981view=rev Log: Plugins now assumes that the core is already builded when building nutch Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml lucene/nutch/trunk/src/plugin/analysis-fr/build.xml lucene/nutch/trunk/src/plugin/build-plugin.xml lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml lucene/nutch/trunk/src/plugin/creativecommons/build.xml lucene/nutch/trunk/src/plugin/index-basic/build.xml lucene/nutch/trunk/src/plugin/index-more/build.xml lucene/nutch/trunk/src/plugin/languageidentifier/build.xml lucene/nutch/trunk/src/plugin/lib-http/build.xml lucene/nutch/trunk/src/plugin/lib-parsems/build.xml lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml lucene/nutch/trunk/src/plugin/ontology/build.xml lucene/nutch/trunk/src/plugin/parse-ext/build.xml lucene/nutch/trunk/src/plugin/parse-html/build.xml lucene/nutch/trunk/src/plugin/parse-js/build.xml lucene/nutch/trunk/src/plugin/parse-mp3/build.xml lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml lucene/nutch/trunk/src/plugin/parse-msword/build.xml lucene/nutch/trunk/src/plugin/parse-pdf/build.xml lucene/nutch/trunk/src/plugin/parse-rss/build.xml lucene/nutch/trunk/src/plugin/parse-swf/build.xml lucene/nutch/trunk/src/plugin/parse-text/build.xml lucene/nutch/trunk/src/plugin/parse-zip/build.xml lucene/nutch/trunk/src/plugin/protocol-file/build.xml lucene/nutch/trunk/src/plugin/protocol-ftp/build.xml lucene/nutch/trunk/src/plugin/protocol-http/build.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml lucene/nutch/trunk/src/plugin/query-basic/build.xml lucene/nutch/trunk/src/plugin/query-more/build.xml lucene/nutch/trunk/src/plugin/query-site/build.xml lucene/nutch/trunk/src/plugin/query-url/build.xml lucene/nutch/trunk/src/plugin/urlfilter-prefix/build.xml 
lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml Modified: lucene/nutch/trunk/src/plugin/analysis-de/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-de/build.xml?rev=382981r1=382980r2=382981view=diff == --- lucene/nutch/trunk/src/plugin/analysis-de/build.xml (original) +++ lucene/nutch/trunk/src/plugin/analysis-de/build.xml Fri Mar 3 16:26:54 2006 @@ -1,12 +1,10 @@ ?xml version=1.0? -project name=analysis-de default=jar +project name=analysis-de default=jar-core import file=../build-plugin.xml/ - - !-- Build compilation dependencies -- + target name=deps-jar -ant target=compile-core inheritall=false dir=${nutch.root}/ ant target=jar inheritall=false dir=../lib-lucene-analyzers/ /target Modified: lucene/nutch/trunk/src/plugin/analysis-fr/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/analysis-fr/build.xml?rev=382981r1=382980r2=382981view=diff == --- lucene/nutch/trunk/src/plugin/analysis-fr/build.xml (original) +++ lucene/nutch/trunk/src/plugin/analysis-fr/build.xml Fri Mar 3 16:26:54 2006 @@ -1,12 +1,11 @@ ?xml version=1.0? 
-project name=analysis-fr default=jar +project name=analysis-fr default=jar-core import file=../build-plugin.xml/ !-- Build compilation dependencies -- target name=deps-jar -ant target=compile-core inheritall=false dir=${nutch.root}/ ant target=jar inheritall=false dir=../lib-lucene-analyzers/ /target Modified: lucene/nutch/trunk/src/plugin/build-plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build-plugin.xml?rev=382981r1=382980r2=382981view=diff == --- lucene/nutch/trunk/src/plugin/build-plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/build-plugin.xml Fri Mar 3 16:26:54 2006 @@ -96,9 +96,14 @@ source=${javac.version} deprecation=${javac.deprecation} classpath refid=classpath/ -/javac +/javac /target + target name=compile-core +ant target=compile-core inheritall=false dir=${nutch.root}/ +ant target=compile/ + /target + !-- == -- !-- Make plugin .jar -- !-- == -- @@ -109,6 +114,13 @@ jarfile=${build.dir}/${name}.jar basedir=${build.classes} / + /target + + target name=jar-core depends=compile-core +jar +jarfile=${build.dir}/${name}.jar +basedir=${build.classes} +/ /target !--