Author: jnioche
Date: Tue Jan 5 10:14:49 2010
New Revision: 895972
URL: http://svn.apache.org/viewvc?rev=895972&view=rev
Log: NUTCH-658 : Add Counter for # of doc fetched in Reporter
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=895972&r1=895971&r2=895972&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Jan 5 10:14:49 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-658 Use counters to report fetching and parsing status (jnioche) + * NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann) * NUTCH-767 Update Tika to v0.5 for the MimeType detection (Julien Nioche via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=895972&r1=895971&r2=895972&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Jan 5 10:14:49 2010 @@ -607,6 +607,7 @@ LOG.debug("Denied by robots.txt: " + fit.url); } output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); + reporter.incrCounter("FetcherStatus", "robots_denied", 1); continue; } if (rules.getCrawlDelay() > 0) { @@ -615,6 +616,7 @@ fetchQueues.finishFetchItem(fit, true); LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping"); output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); + reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1); continue; } else { FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID); 
@@ -630,6 +632,8 @@ String urlString = fit.url.toString(); + reporter.incrCounter("FetcherStatus", status.getName(), 1); + switch(status.getCode()) { case ProtocolStatus.WOULDBLOCK: @@ -664,6 +668,7 @@ } else { // stop redirecting redirecting = false; + reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1); } } } @@ -701,6 +706,7 @@ } else { // stop redirecting redirecting = false; + reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1); } } else { // stop redirecting @@ -926,6 +932,7 @@ if (parseResult != null && !parseResult.isEmpty()) { Parse p = parseResult.get(content.getUrl()); if (p != null) { + reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1); return p.getData().getStatus(); } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=895972&r1=895971&r2=895972&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue Jan 5 10:14:49 2010 @@ -93,6 +93,8 @@ Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); + reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[parseStatus.getMajorCode()], 1); + if (!parseStatus.isSuccess()) { LOG.warn("Error parsing: " + key + ": " + parseStatus); parse = parseStatus.getEmptyParse(getConf()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=895972&r1=895971&r2=895972&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java 
(original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Tue Jan 5 10:14:49 2010 @@ -191,6 +191,10 @@ public int getCode() { return code; } + + public String getName() { + return codeToName.get(this.code); + } public void setCode(int code) { this.code = code;