svn commit: r1335436 - in /nutch/branches/nutchgora: ./ conf/ src/bin/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/

lewismc Tue, 08 May 2012 04:53:57 -0700

Author: lewismc
Date: Tue May  8 11:48:00 2012
New Revision: 1335436

URL: http://svn.apache.org/viewvc?rev=1335436&view=rev
Log:
Commit to address NUTCH-1349 and update to CHANGES.txt


Modified:
    nutch/branches/nutchgora/CHANGES.txt
    nutch/branches/nutchgora/conf/log4j.properties
    nutch/branches/nutchgora/src/bin/nutch
    nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/WebTableReader.java
    nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherJob.java
    nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java
    nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java

Modified: nutch/branches/nutchgora/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Tue May  8 11:48:00 2012
@@ -1,11 +1,14 @@
 Nutch Change Log
 
 Release nutchgora - Current Development
+
+* NUTCH-1349 Make batchId explicit within debug logging and improve CLI 
(lewismc + ferdy)
+
 * NUTCH-1352 Improve regex urlfilters/normalizers synchronization (ferdy)
 
 * NUTCH-1356 ParseUtil use ExecutorService instead of manually thread 
handling. (ferdy)
 
-* NUTCH-1355 nutchgora Configure minimum throughput for fetcher
+* NUTCH-1355 nutchgora Configure minimum throughput for fetcher (ferdy)
 
 * NUTCH-1354 nutchgora support fetcher.queue.depth.multiplier property (ferdy)
 

Modified: nutch/branches/nutchgora/conf/log4j.properties
URL: 
http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/log4j.properties?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/log4j.properties (original)
+++ nutch/branches/nutchgora/conf/log4j.properties Tue May  8 11:48:00 2012
@@ -26,15 +26,17 @@ log4j.threshhold=ALL
 #special logging requirements for some commandline tools
 log4j.logger.org.apache.nutch.crawl.Crawl=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.InjectorJob=INFO,cmdstdout
+log4j.logger.org.apache.nutch.host.HostInjectorJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.GeneratorJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.DbUpdaterJob=INFO,cmdstdout
+log4j.logger.org.apache.nutch.host.HostDbUpdateJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.fetcher.FetcherJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.parse.ParserJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.indexer.IndexerJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.indexer.solr.SolrIndexerJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.IndexMerger=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout
+log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout
 
 log4j.logger.org.apache.nutch=INFO
 log4j.logger.org.apache.hadoop=WARN

Modified: nutch/branches/nutchgora/src/bin/nutch
URL: 
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/bin/nutch?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/bin/nutch (original)
+++ nutch/branches/nutchgora/src/bin/nutch Tue May  8 11:48:00 2012
@@ -49,11 +49,14 @@ if [ $# = 0 ]; then
   echo "where COMMAND is one of:"
 # echo " crawl one-step crawler for intranets"
   echo " inject                inject new urls into the database"
+  echo " hostinject     creates or updates an existing host table from a text 
file"
   echo " generate      generate new segments to fetch from crawl db"
   echo " fetch                 fetch URLs marked during generate"
   echo " parse                 parse URLs marked during fetch"
   echo " updatedb      update web table after parsing"
+  echo " updatehostdb   update host table after parsing"
   echo " readdb        read/dump records from page database"
+  echo " readhostdb     display entries from the hostDB"
   echo " solrindex     run the solr indexer on parsed segments and linkdb"
   echo " solrdedup     remove duplicates from solr"
   echo " plugin        load a plugin and run one of its classes main()"
@@ -184,6 +187,8 @@ if [ "$COMMAND" = "crawl" ] ; then
 CLASS=org.apache.nutch.crawl.Crawler
 elif [ "$COMMAND" = "inject" ] ; then
 CLASS=org.apache.nutch.crawl.InjectorJob
+elif [ "$COMMAND" = "hostinject" ] ; then
+CLASS=org.apache.nutch.host.HostInjectorJob
 elif [ "$COMMAND" = "generate" ] ; then
 CLASS=org.apache.nutch.crawl.GeneratorJob
 elif [ "$COMMAND" = "fetch" ] ; then
@@ -192,8 +197,12 @@ elif [ "$COMMAND" = "parse" ] ; then
 CLASS=org.apache.nutch.parse.ParserJob
 elif [ "$COMMAND" = "updatedb" ] ; then
 CLASS=org.apache.nutch.crawl.DbUpdaterJob
+elif [ "$COMMAND" = "updatehostdb" ] ; then
+CLASS=org.apache.nutch.host.HostDbUpdateJob
 elif [ "$COMMAND" = "readdb" ] ; then
 CLASS=org.apache.nutch.crawl.WebTableReader
+elif [ "$COMMAND" = "readhostdb" ] ; then
+CLASS=org.apache.nutch.host.HostDbReader
 elif [ "$COMMAND" = "solrindex" ] ; then
 CLASS=org.apache.nutch.indexer.solr.SolrIndexerJob
 elif [ "$COMMAND" = "solrdedup" ] ; then

Modified: 
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/WebTableReader.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/WebTableReader.java?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- 
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/WebTableReader.java 
(original)
+++ 
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/WebTableReader.java 
Tue May  8 11:48:00 2012
@@ -419,21 +419,17 @@ public class WebTableReader extends Nutc
   public int run(String[] args) throws Exception {
     if (args.length < 1) {
       System.err
-          .println("Usage: WebTableReader (-stats | -url [url] | -dump 
<out_dir> [-regex regex]) [-crawlId <id>] [-content] [-headers] [-links] 
[-text]");
-      System.err.println("\t-crawlId <id>\t the id to prefix the schemas to 
operate on, (default: storage.crawl.id)");
-      System.err
-          .println("\t-stats [-sort] \tprint overall statistics to 
System.out");
-      System.err.println("\t\t[-sort]\tlist status sorted by host");
-      System.err
-          .println("\t-url <url>\tprint information on <url> to System.out");
-      System.err
-          .println("\t-dump <out_dir> [-regex regex]\tdump the webtable to a 
text file in <out_dir>");
-      System.err.println("\t\t-content\tdump also raw content");
-      System.err.println("\t\t-headers\tdump protocol headers");
-      System.err.println("\t\t-links\tdump links");
-      System.err.println("\t\t-text\tdump extracted text");
-      System.err
-          .println("\t\t[-regex]\tfilter on the URL of the webtable entry");
+          .println("Usage: WebTableReader (-stats | -url [url] | -dump 
<out_dir> [-regex regex]) \n \t \t      [-crawlId <id>] [-content] [-headers] 
[-links] [-text]");
+      System.err.println("    -crawlId <id>  - the id to prefix the schemas to 
operate on, \n \t \t     (default: storage.crawl.id)");
+      System.err.println("    -stats [-sort] - print overall statistics to 
System.out");
+      System.err.println("    [-sort]        - list status sorted by host");
+      System.err.println("    -url <url>     - print information on <url> to 
System.out");
+      System.err.println("    -dump <out_dir> [-regex regex] - dump the 
webtable to a text file in \n \t \t     <out_dir>");
+      System.err.println("    -content       - dump also raw content");
+      System.err.println("    -headers       - dump protocol headers");
+      System.err.println("    -links         - dump links");
+      System.err.println("    -text          - dump extracted text");
+      System.err.println("    [-regex]       - filter on the URL of the 
webtable entry");
       return -1;
     }
     String param = null;

Modified: 
nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherJob.java?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherJob.java 
(original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherJob.java 
Tue May  8 11:48:00 2012
@@ -110,7 +110,7 @@ public class FetcherJob extends NutchToo
       Utf8 mark = Mark.GENERATE_MARK.checkMark(page);
       if (!NutchJob.shouldProcess(mark, batchId)) {
         if (LOG.isDebugEnabled()) {
-          LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; different 
batch id");
+          LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; different 
batch id (" + mark + ")");
         }
         return;
       }
@@ -265,12 +265,12 @@ public class FetcherJob extends NutchToo
     String batchId;
 
     String usage = "Usage: FetcherJob (<batchId> | -all) [-crawlId <id>] " +
-      "[-threads N] [-parse] [-resume] [-numTasks N]\n" +
-      "\tbatchId\tcrawl identifier returned by Generator, or -all for all 
generated batchId-s\n" +
-      "\t-crawlId <id>\t the id to prefix the schemas to operate on, (default: 
storage.crawl.id)\n" +
-      "\t-threads N\tnumber of fetching threads per task\n" +
-      "\t-resume\tresume interrupted job\n" +
-      "\t-numTasks N\tif N > 0 then use this many reduce tasks for fetching 
(default: mapred.map.tasks)";
+      "[-threads N] [-parse] \n \t \t  [-resume] [-numTasks N]\n" +
+      "    <batchId>     - crawl identifier returned by Generator, or -all for 
all \n \t \t    generated batchId-s\n" +
+      "    -crawlId <id> - the id to prefix the schemas to operate on, \n \t 
\t    (default: storage.crawl.id)\n" +
+      "    -threads N    - number of fetching threads per task\n" +
+      "    -resume       - resume interrupted job\n" +
+      "    -numTasks N   - if N > 0 then use this many reduce tasks for 
fetching \n \t \t    (default: mapred.map.tasks)";
 
     if (args.length == 0) {
       System.err.println(usage);

Modified: 
nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java 
(original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java 
Tue May  8 11:48:00 2012
@@ -80,7 +80,7 @@ public abstract class IndexerJob extends
       if (!batchId.equals(REINDEX)) {
         if (!NutchJob.shouldProcess(mark, batchId)) {
           if (LOG.isDebugEnabled()) {
-            LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; different 
batch id");
+            LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; different 
batch id (" + mark + ")");
           }
           return;
         }

Modified: 
nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java 
(original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java Tue 
May  8 11:48:00 2012
@@ -269,11 +269,11 @@ public class ParserJob extends NutchTool
 
     if (args.length < 1) {
       System.err.println("Usage: ParserJob (<batchId> | -all) [-crawlId <id>] 
[-resume] [-force]");
-      System.err.println("\tbatchId\tsymbolic batch ID created by Generator");
-      System.err.println("\t-crawlId <id>\t the id to prefix the schemas to 
operate on, (default: storage.crawl.id)");
-      System.err.println("\t-all\tconsider pages from all crawl jobs");
-      System.err.println("-resume\tresume a previous incomplete job");
-      System.err.println("-force\tforce re-parsing even if a page is already 
parsed");
+      System.err.println("    <batchId>     - symbolic batch ID created by 
Generator");
+      System.err.println("    -crawlId <id> - the id to prefix the schemas to 
operate on, \n \t \t    (default: storage.crawl.id)");
+      System.err.println("    -all          - consider pages from all crawl 
jobs");
+      System.err.println("    -resume       - resume a previous incomplete 
job");
+      System.err.println("    -force        - force re-parsing even if a page 
is already parsed");
       return -1;
     }
     for (int i = 0; i < args.length; i++) {

svn commit: r1335436 - in /nutch/branches/nutchgora: ./ conf/ src/bin/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/

Reply via email to