Author: lewismc
Date: Tue May 8 11:48:00 2012
New Revision: 1335436
URL: http://svn.apache.org/viewvc?rev=1335436&view=rev
Log:
Commit to address NUTCH-1349 and update to CHANGES.txt
Modified:
nutch/branches/nutchgora/CHANGES.txt
nutch/branches/nutchgora/conf/log4j.properties
nutch/branches/nutchgora/src/bin/nutch
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/WebTableReader.java
nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherJob.java
nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java
nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
Modified: nutch/branches/nutchgora/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Tue May 8 11:48:00 2012
@@ -1,11 +1,14 @@
Nutch Change Log
Release nutchgora - Current Development
+
+* NUTCH-1349 Make batchId explicit within debug logging and improve CLI (lewismc + ferdy)
+
* NUTCH-1352 Improve regex urlfilters/normalizers synchronization (ferdy)
* NUTCH-1356 ParseUtil use ExecutorService instead of manually thread
handling. (ferdy)
-* NUTCH-1355 nutchgora Configure minimum throughput for fetcher
+* NUTCH-1355 nutchgora Configure minimum throughput for fetcher (ferdy)
* NUTCH-1354 nutchgora support fetcher.queue.depth.multiplier property (ferdy)
Modified: nutch/branches/nutchgora/conf/log4j.properties
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/log4j.properties?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/log4j.properties (original)
+++ nutch/branches/nutchgora/conf/log4j.properties Tue May 8 11:48:00 2012
@@ -26,15 +26,17 @@ log4j.threshhold=ALL
#special logging requirements for some commandline tools
log4j.logger.org.apache.nutch.crawl.Crawl=INFO,cmdstdout
log4j.logger.org.apache.nutch.crawl.InjectorJob=INFO,cmdstdout
+log4j.logger.org.apache.nutch.host.HostInjectorJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.crawl.GeneratorJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.crawl.DbUpdaterJob=INFO,cmdstdout
+log4j.logger.org.apache.nutch.host.HostDbUpdateJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.fetcher.FetcherJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.parse.ParserJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.IndexerJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.solr.SolrIndexerJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.IndexMerger=INFO,cmdstdout
log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout
+log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout
log4j.logger.org.apache.nutch=INFO
log4j.logger.org.apache.hadoop=WARN
Modified: nutch/branches/nutchgora/src/bin/nutch
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/bin/nutch?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/bin/nutch (original)
+++ nutch/branches/nutchgora/src/bin/nutch Tue May 8 11:48:00 2012
@@ -49,11 +49,14 @@ if [ $# = 0 ]; then
echo "where COMMAND is one of:"
# echo " crawl one-step crawler for intranets"
echo " inject inject new urls into the database"
+ echo " hostinject creates or updates an existing host table from a text file"
echo " generate generate new segments to fetch from crawl db"
echo " fetch fetch URLs marked during generate"
echo " parse parse URLs marked during fetch"
echo " updatedb update web table after parsing"
+ echo " updatehostdb update host table after parsing"
echo " readdb read/dump records from page database"
+ echo " readhostdb display entries from the hostDB"
echo " solrindex run the solr indexer on parsed segments and linkdb"
echo " solrdedup remove duplicates from solr"
echo " plugin load a plugin and run one of its classes main()"
@@ -184,6 +187,8 @@ if [ "$COMMAND" = "crawl" ] ; then
CLASS=org.apache.nutch.crawl.Crawler
elif [ "$COMMAND" = "inject" ] ; then
CLASS=org.apache.nutch.crawl.InjectorJob
+elif [ "$COMMAND" = "hostinject" ] ; then
+CLASS=org.apache.nutch.host.HostInjectorJob
elif [ "$COMMAND" = "generate" ] ; then
CLASS=org.apache.nutch.crawl.GeneratorJob
elif [ "$COMMAND" = "fetch" ] ; then
@@ -192,8 +197,12 @@ elif [ "$COMMAND" = "parse" ] ; then
CLASS=org.apache.nutch.parse.ParserJob
elif [ "$COMMAND" = "updatedb" ] ; then
CLASS=org.apache.nutch.crawl.DbUpdaterJob
+elif [ "$COMMAND" = "updatehostdb" ] ; then
+CLASS=org.apache.nutch.host.HostDbUpdateJob
elif [ "$COMMAND" = "readdb" ] ; then
CLASS=org.apache.nutch.crawl.WebTableReader
+elif [ "$COMMAND" = "readhostdb" ] ; then
+CLASS=org.apache.nutch.host.HostDbReader
elif [ "$COMMAND" = "solrindex" ] ; then
CLASS=org.apache.nutch.indexer.solr.SolrIndexerJob
elif [ "$COMMAND" = "solrdedup" ] ; then
Modified:
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/WebTableReader.java
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/WebTableReader.java?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/WebTableReader.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/WebTableReader.java Tue May 8 11:48:00 2012
@@ -419,21 +419,17 @@ public class WebTableReader extends Nutc
public int run(String[] args) throws Exception {
if (args.length < 1) {
System.err
- .println("Usage: WebTableReader (-stats | -url [url] | -dump
<out_dir> [-regex regex]) [-crawlId <id>] [-content] [-headers] [-links]
[-text]");
- System.err.println("\t-crawlId <id>\t the id to prefix the schemas to
operate on, (default: storage.crawl.id)");
- System.err
- .println("\t-stats [-sort] \tprint overall statistics to
System.out");
- System.err.println("\t\t[-sort]\tlist status sorted by host");
- System.err
- .println("\t-url <url>\tprint information on <url> to System.out");
- System.err
- .println("\t-dump <out_dir> [-regex regex]\tdump the webtable to a
text file in <out_dir>");
- System.err.println("\t\t-content\tdump also raw content");
- System.err.println("\t\t-headers\tdump protocol headers");
- System.err.println("\t\t-links\tdump links");
- System.err.println("\t\t-text\tdump extracted text");
- System.err
- .println("\t\t[-regex]\tfilter on the URL of the webtable entry");
+ .println("Usage: WebTableReader (-stats | -url [url] | -dump
<out_dir> [-regex regex]) \n \t \t [-crawlId <id>] [-content] [-headers]
[-links] [-text]");
+ System.err.println(" -crawlId <id> - the id to prefix the schemas to
operate on, \n \t \t (default: storage.crawl.id)");
+ System.err.println(" -stats [-sort] - print overall statistics to
System.out");
+ System.err.println(" [-sort] - list status sorted by host");
+ System.err.println(" -url <url> - print information on <url> to
System.out");
+ System.err.println(" -dump <out_dir> [-regex regex] - dump the
webtable to a text file in \n \t \t <out_dir>");
+ System.err.println(" -content - dump also raw content");
+ System.err.println(" -headers - dump protocol headers");
+ System.err.println(" -links - dump links");
+ System.err.println(" -text - dump extracted text");
+ System.err.println(" [-regex] - filter on the URL of the
webtable entry");
return -1;
}
String param = null;
Modified:
nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherJob.java?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherJob.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherJob.java Tue May 8 11:48:00 2012
@@ -110,7 +110,7 @@ public class FetcherJob extends NutchToo
Utf8 mark = Mark.GENERATE_MARK.checkMark(page);
if (!NutchJob.shouldProcess(mark, batchId)) {
if (LOG.isDebugEnabled()) {
- LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; different
batch id");
+ LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; different
batch id (" + mark + ")");
}
return;
}
@@ -265,12 +265,12 @@ public class FetcherJob extends NutchToo
String batchId;
String usage = "Usage: FetcherJob (<batchId> | -all) [-crawlId <id>] " +
- "[-threads N] [-parse] [-resume] [-numTasks N]\n" +
- "\tbatchId\tcrawl identifier returned by Generator, or -all for all
generated batchId-s\n" +
- "\t-crawlId <id>\t the id to prefix the schemas to operate on, (default:
storage.crawl.id)\n" +
- "\t-threads N\tnumber of fetching threads per task\n" +
- "\t-resume\tresume interrupted job\n" +
- "\t-numTasks N\tif N > 0 then use this many reduce tasks for fetching
(default: mapred.map.tasks)";
+ "[-threads N] [-parse] \n \t \t [-resume] [-numTasks N]\n" +
+ " <batchId> - crawl identifier returned by Generator, or -all for
all \n \t \t generated batchId-s\n" +
+ " -crawlId <id> - the id to prefix the schemas to operate on, \n \t
\t (default: storage.crawl.id)\n" +
+ " -threads N - number of fetching threads per task\n" +
+ " -resume - resume interrupted job\n" +
+ " -numTasks N - if N > 0 then use this many reduce tasks for
fetching \n \t \t (default: mapred.map.tasks)";
if (args.length == 0) {
System.err.println(usage);
Modified:
nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java Tue May 8 11:48:00 2012
@@ -80,7 +80,7 @@ public abstract class IndexerJob extends
if (!batchId.equals(REINDEX)) {
if (!NutchJob.shouldProcess(mark, batchId)) {
if (LOG.isDebugEnabled()) {
- LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; different
batch id");
+ LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; different
batch id (" + mark + ")");
}
return;
}
Modified:
nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java?rev=1335436&r1=1335435&r2=1335436&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java Tue May 8 11:48:00 2012
@@ -269,11 +269,11 @@ public class ParserJob extends NutchTool
if (args.length < 1) {
System.err.println("Usage: ParserJob (<batchId> | -all) [-crawlId <id>]
[-resume] [-force]");
- System.err.println("\tbatchId\tsymbolic batch ID created by Generator");
- System.err.println("\t-crawlId <id>\t the id to prefix the schemas to
operate on, (default: storage.crawl.id)");
- System.err.println("\t-all\tconsider pages from all crawl jobs");
- System.err.println("-resume\tresume a previous incomplete job");
- System.err.println("-force\tforce re-parsing even if a page is already
parsed");
+ System.err.println(" <batchId> - symbolic batch ID created by
Generator");
+ System.err.println(" -crawlId <id> - the id to prefix the schemas to
operate on, \n \t \t (default: storage.crawl.id)");
+ System.err.println(" -all - consider pages from all crawl
jobs");
+ System.err.println(" -resume - resume a previous incomplete
job");
+ System.err.println(" -force - force re-parsing even if a page
is already parsed");
return -1;
}
for (int i = 0; i < args.length; i++) {