This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new f71bab402 NUTCH-3132 Standardize existing Nutch metrics naming and
implementation (#871)
f71bab402 is described below
commit f71bab402821d2c30418453b051c38a165beaf66
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Thu Dec 11 08:47:30 2025 -0800
NUTCH-3132 Standardize existing Nutch metrics naming and implementation
(#871)
---
src/java/org/apache/nutch/crawl/CrawlDbFilter.java | 12 +-
.../org/apache/nutch/crawl/CrawlDbReducer.java | 7 +-
.../org/apache/nutch/crawl/DeduplicationJob.java | 5 +-
src/java/org/apache/nutch/crawl/Generator.java | 37 +-
src/java/org/apache/nutch/crawl/Injector.java | 37 +-
src/java/org/apache/nutch/fetcher/Fetcher.java | 21 +-
.../org/apache/nutch/fetcher/FetcherThread.java | 75 ++++-
src/java/org/apache/nutch/fetcher/QueueFeeder.java | 15 +-
.../org/apache/nutch/hostdb/ResolverThread.java | 26 +-
.../apache/nutch/hostdb/UpdateHostDbMapper.java | 13 +-
.../apache/nutch/hostdb/UpdateHostDbReducer.java | 10 +-
src/java/org/apache/nutch/indexer/CleaningJob.java | 4 +-
.../org/apache/nutch/indexer/IndexerMapReduce.java | 31 +-
.../org/apache/nutch/metrics/NutchMetrics.java | 371 +++++++++++++++++++++
.../org/apache/nutch/metrics/package-info.java | 32 ++
src/java/org/apache/nutch/parse/ParseSegment.java | 4 +-
.../apache/nutch/scoring/webgraph/WebGraph.java | 7 +-
.../org/apache/nutch/tools/warc/WARCExporter.java | 40 ++-
.../org/apache/nutch/util/SitemapProcessor.java | 26 +-
19 files changed, 651 insertions(+), 122 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
index d9ab0d3cc..7f28a3a85 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
@@ -24,6 +24,7 @@ import org.slf4j.LoggerFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
@@ -80,15 +81,15 @@ public class CrawlDbFilter extends
// https://issues.apache.org/jira/browse/NUTCH-1101 check status first,
// cheaper than normalizing or filtering
if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) {
- context.getCounter("CrawlDB filter",
- "Gone records removed").increment(1);
+ context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER,
+ NutchMetrics.CRAWLDB_GONE_RECORDS_REMOVED_TOTAL).increment(1);
return;
}
// Whether to remove orphaned pages
// https://issues.apache.org/jira/browse/NUTCH-1932
if (purgeOrphans && CrawlDatum.STATUS_DB_ORPHAN == value.getStatus()) {
- context.getCounter("CrawlDB filter",
- "Orphan records removed").increment(1);
+ context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER,
+ NutchMetrics.CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL).increment(1);
return;
}
if (url != null && urlNormalizers) {
@@ -108,7 +109,8 @@ public class CrawlDbFilter extends
}
}
if (url == null) {
- context.getCounter("CrawlDB filter", "URLs filtered").increment(1);
+ context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER,
+ NutchMetrics.CRAWLDB_URLS_FILTERED_TOTAL).increment(1);
} else {
// URL has passed filters
newKey.set(url); // collect it
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
index deb266af6..e263f8463 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.PriorityQueue;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.StringUtil;
@@ -163,7 +164,8 @@ public class CrawlDbReducer extends
LOG.warn("Couldn't update orphaned score, key={}: {}", key, e);
}
context.write(key, old);
- context.getCounter("CrawlDB status",
+ // Dynamic counter based on status name
+ context.getCounter(NutchMetrics.GROUP_CRAWLDB,
CrawlDatum.getStatusName(old.getStatus())).increment(1);
} else {
LOG.warn("Missing fetch and old value, signature={}",
@@ -319,7 +321,8 @@ public class CrawlDbReducer extends
// remove generation time, if any
result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
context.write(key, result);
- context.getCounter("CrawlDB status",
+ // Dynamic counter based on status name
+ context.getCounter(NutchMetrics.GROUP_CRAWLDB,
CrawlDatum.getStatusName(result.getStatus())).increment(1);
}
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index 3e12d4598..cdb291fe8 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -45,6 +45,7 @@ import
org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
@@ -139,8 +140,8 @@ public class DeduplicationJob extends NutchTool implements
Tool {
throws IOException, InterruptedException {
datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE);
Text key = (Text) datum.getMetaData().remove(urlKey);
- context.getCounter("DeduplicationJobStatus",
- "Documents marked as duplicate").increment(1);
+ context.getCounter(NutchMetrics.GROUP_DEDUP,
+ NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL).increment(1);
context.write(key, datum);
}
diff --git a/src/java/org/apache/nutch/crawl/Generator.java
b/src/java/org/apache/nutch/crawl/Generator.java
index 82475af5b..db15f0426 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -67,6 +67,7 @@ import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.nutch.hostdb.HostDatum;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
@@ -225,11 +226,13 @@ public class Generator extends NutchTool implements Tool {
// URLFilters
try {
if (filters.filter(url.toString()) == null) {
- context.getCounter("Generator",
"URL_FILTERS_REJECTED").increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+
NutchMetrics.GENERATOR_URL_FILTERS_REJECTED_TOTAL).increment(1);
return;
}
} catch (URLFilterException e) {
- context.getCounter("Generator", "URL_FILTER_EXCEPTION").increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_URL_FILTER_EXCEPTION_TOTAL).increment(1);
LOG.warn("Couldn't filter url: {} ({})", url, e.getMessage());
}
}
@@ -239,7 +242,8 @@ public class Generator extends NutchTool implements Tool {
if (!schedule.shouldFetch(url, crawlDatum, curTime)) {
LOG.debug("-shouldFetch rejected '{}', fetchTime={}, curTime={}", url,
crawlDatum.getFetchTime(), curTime);
- context.getCounter("Generator", "SCHEDULE_REJECTED").increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_SCHEDULE_REJECTED_TOTAL).increment(1);
return;
}
@@ -248,7 +252,8 @@ public class Generator extends NutchTool implements Tool {
if (oldGenTime != null) { // awaiting fetch & update
if (oldGenTime.get() + genDelay > curTime) { // still wait for
// update
- context.getCounter("Generator", "WAIT_FOR_UPDATE").increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_WAIT_FOR_UPDATE_TOTAL).increment(1);
return;
}
}
@@ -262,19 +267,22 @@ public class Generator extends NutchTool implements Tool {
// check expr
if (expr != null) {
if (!crawlDatum.execute(expr, key.toString())) {
- context.getCounter("Generator", "EXPR_REJECTED").increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_EXPR_REJECTED_TOTAL).increment(1);
return;
}
}
if (restrictStatus != -1 && restrictStatus != crawlDatum.getStatus()) {
- context.getCounter("Generator", "STATUS_REJECTED").increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_STATUS_REJECTED_TOTAL).increment(1);
return;
}
// consider only entries with a score superior to the threshold
if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold) {
- context.getCounter("Generator", "SCORE_TOO_LOW").increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_SCORE_TOO_LOW_TOTAL).increment(1);
return;
}
@@ -282,7 +290,8 @@ public class Generator extends NutchTool implements Tool {
// threshold
if (intervalThreshold != -1
&& crawlDatum.getFetchInterval() > intervalThreshold) {
- context.getCounter("Generator", "INTERVAL_REJECTED").increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_INTERVAL_REJECTED_TOTAL).increment(1);
return;
}
@@ -507,7 +516,8 @@ public class Generator extends NutchTool implements Tool {
} catch (MalformedURLException e) {
LOG.warn("Malformed URL: '{}', skipping ({})", urlString,
StringUtils.stringifyException(e));
- context.getCounter("Generator", "MALFORMED_URL").increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_MALFORMED_URL_TOTAL).increment(1);
continue;
}
@@ -539,16 +549,15 @@ public class Generator extends NutchTool implements Tool {
hostCount[1] = 1;
} else {
if (hostCount[1] == (maxCount+1)) {
- context
- .getCounter("Generator",
"HOSTS_AFFECTED_PER_HOST_OVERFLOW")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+
NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL).increment(1);
LOG.info(
"Host or domain {} has more than {} URLs for all {}
segments. Additional URLs won't be included in the fetchlist.",
hostordomain, maxCount, maxNumSegments);
}
// skip this entry
- context.getCounter("Generator", "URLS_SKIPPED_PER_HOST_OVERFLOW")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+
NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL).increment(1);
continue;
}
}
diff --git a/src/java/org/apache/nutch/crawl/Injector.java
b/src/java/org/apache/nutch/crawl/Injector.java
index 819c91e3a..03a54f1eb 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -36,6 +36,7 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
@@ -218,7 +219,8 @@ public class Injector extends NutchTool implements Tool {
url = filterNormalize(url);
if (url == null) {
- context.getCounter("injector", "urls_filtered").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL).increment(1);
} else {
CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_INJECTED);
@@ -238,7 +240,8 @@ public class Injector extends NutchTool implements Tool {
"Cannot filter injected score for url {}, using default ({})",
url, e.getMessage());
}
- context.getCounter("injector", "urls_injected").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL).increment(1);
context.write(key, datum);
}
} else if (value instanceof CrawlDatum) {
@@ -248,14 +251,16 @@ public class Injector extends NutchTool implements Tool {
// remove 404 urls
if (url404Purging && CrawlDatum.STATUS_DB_GONE == datum.getStatus()) {
- context.getCounter("injector", "urls_purged_404").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL).increment(1);
return;
}
if (filterNormalizeAll) {
String url = filterNormalize(key.toString());
if (url == null) {
- context.getCounter("injector", "urls_purged_filter").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL).increment(1);
} else {
key.set(url);
context.write(key, datum);
@@ -341,9 +346,11 @@ public class Injector extends NutchTool implements Tool {
}
}
if (injectedSet) {
- context.getCounter("injector", "urls_injected_unique").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL).increment(1);
if (oldSet) {
- context.getCounter("injector", "urls_merged").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_MERGED_TOTAL).increment(1);
}
}
context.write(key, result);
@@ -454,17 +461,23 @@ public class Injector extends NutchTool implements Tool {
if (LOG.isInfoEnabled()) {
long urlsInjected = job.getCounters()
- .findCounter("injector", "urls_injected").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL).getValue();
long urlsInjectedUniq = job.getCounters()
- .findCounter("injector", "urls_injected_unique").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL).getValue();
long urlsFiltered = job.getCounters()
- .findCounter("injector", "urls_filtered").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL).getValue();
long urlsMerged = job.getCounters()
- .findCounter("injector", "urls_merged").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_MERGED_TOTAL).getValue();
long urlsPurged404 = job.getCounters()
- .findCounter("injector", "urls_purged_404").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL).getValue();
long urlsPurgedFilter = job.getCounters()
- .findCounter("injector", "urls_purged_filter").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL).getValue();
LOG.info("Injector: Total urls rejected by filters: {}", urlsFiltered);
LOG.info(
"Injector: Total urls injected after normalization and filtering:
{} (unique URLs: {})",
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java
b/src/java/org/apache/nutch/fetcher/Fetcher.java
index ead0167db..01144f493 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -48,6 +48,7 @@ import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -291,8 +292,8 @@ public class Fetcher extends NutchTool implements Tool {
pagesLastSec = pages.get() - pagesLastSec;
bytesLastSec = (int) bytes.get() - bytesLastSec;
- innerContext.getCounter("FetcherStatus", "bytes_downloaded")
- .increment(bytesLastSec);
+ innerContext.getCounter(NutchMetrics.GROUP_FETCHER,
+
NutchMetrics.FETCHER_BYTES_DOWNLOADED_TOTAL).increment(bytesLastSec);
reportStatus(innerContext, fetchQueues, pagesLastSec, bytesLastSec);
@@ -330,8 +331,8 @@ public class Fetcher extends NutchTool implements Tool {
int hitByThrougputThreshold = fetchQueues.emptyQueues();
if (hitByThrougputThreshold != 0)
- innerContext
- .getCounter("FetcherStatus", "hitByThrougputThreshold")
+ innerContext.getCounter(NutchMetrics.GROUP_FETCHER,
+ NutchMetrics.FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL)
.increment(hitByThrougputThreshold);
}
}
@@ -413,8 +414,8 @@ public class Fetcher extends NutchTool implements Tool {
if (!feeder.isAlive()) {
int hitByTimeLimit = fetchQueues.checkTimelimit();
if (hitByTimeLimit != 0)
- innerContext.getCounter("FetcherStatus", "hitByTimeLimit")
- .increment(hitByTimeLimit);
+ innerContext.getCounter(NutchMetrics.GROUP_FETCHER,
+
NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL).increment(hitByTimeLimit);
}
/*
@@ -430,8 +431,8 @@ public class Fetcher extends NutchTool implements Tool {
timeout);
LOG.warn("Aborting with {} hung threads{}.", activeThreads,
feeder.isAlive() ? " (queue feeder still alive)" : "");
- innerContext.getCounter("FetcherStatus", "hungThreads")
- .increment(activeThreads.get());
+ innerContext.getCounter(NutchMetrics.GROUP_FETCHER,
+
NutchMetrics.FETCHER_HUNG_THREADS_TOTAL).increment(activeThreads.get());
for (int i = 0; i < fetcherThreads.size(); i++) {
FetcherThread thread = fetcherThreads.get(i);
if (thread.isAlive()) {
@@ -466,8 +467,8 @@ public class Fetcher extends NutchTool implements Tool {
fetchQueues.getTotalSize(), fetchQueues.getQueueCount(),
feeder.isAlive() ? " (queue feeder still alive)" : "");
int hitByTimeout = fetchQueues.emptyQueues();
- innerContext.getCounter("FetcherStatus", "hitByTimeout")
- .increment(hitByTimeout);
+ innerContext.getCounter(NutchMetrics.GROUP_FETCHER,
+
NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL).increment(hitByTimeout);
return;
}
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java
b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index f886aed42..8b4e5c95c 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -31,12 +31,14 @@ import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.fetcher.Fetcher.FetcherRun;
import org.apache.nutch.fetcher.FetcherThreadEvent.PublishEventType;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLExemptionFilters;
@@ -152,6 +154,18 @@ public class FetcherThread extends Thread {
private ProtocolLogUtil logUtil = new ProtocolLogUtil();
+ // Cached counters for performance (avoid repeated lookups in hot paths)
+ private Counter robotsDeniedCounter;
+ private Counter robotsDeniedMaxCrawlDelayCounter;
+ private Counter robotsDeferVisitsDroppedCounter;
+ private Counter redirectCountExceededCounter;
+ private Counter redirectDeduplicatedCounter;
+ private Counter redirectNotCreatedCounter;
+ private Counter hitByTimeLimitCounter;
+ private Counter aboveExceptionThresholdCounter;
+ private Counter outlinksDetectedCounter;
+ private Counter outlinksFollowingCounter;
+
public FetcherThread(Configuration conf, AtomicInteger activeThreads,
FetchItemQueues fetchQueues,
QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong
lastRequestStart, FetcherRun.Context context,
AtomicInteger errors, String segmentName, boolean parsing, boolean
storingContent,
@@ -241,6 +255,35 @@ public class FetcherThread extends Thread {
getName(), Thread.currentThread().getId());
}
}
+
+ // Initialize cached counters for performance
+ initCounters();
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot
paths.
+ */
+ private void initCounters() {
+ robotsDeniedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ROBOTS_DENIED_TOTAL);
+ robotsDeniedMaxCrawlDelayCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER,
NutchMetrics.FETCHER_ROBOTS_DENIED_MAXCRAWLDELAY_TOTAL);
+ robotsDeferVisitsDroppedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER,
NutchMetrics.FETCHER_ROBOTS_DEFER_VISITS_DROPPED_TOTAL);
+ redirectCountExceededCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER,
NutchMetrics.FETCHER_REDIRECT_COUNT_EXCEEDED_TOTAL);
+ redirectDeduplicatedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER,
NutchMetrics.FETCHER_REDIRECT_DEDUPLICATED_TOTAL);
+ redirectNotCreatedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER,
NutchMetrics.FETCHER_REDIRECT_NOT_CREATED_TOTAL);
+ hitByTimeLimitCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER,
NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL);
+ aboveExceptionThresholdCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER,
NutchMetrics.FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL);
+ outlinksDetectedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER_OUTLINKS,
NutchMetrics.FETCHER_OUTLINKS_DETECTED_TOTAL);
+ outlinksFollowingCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER_OUTLINKS,
NutchMetrics.FETCHER_OUTLINKS_FOLLOWING_TOTAL);
}
@Override
@@ -334,9 +377,7 @@ public class FetcherThread extends Thread {
fit.getQueueID(), this.robotsDeferVisitsRetries + 1,
this.robotsDeferVisitsDelay);
if (killedURLs != 0) {
- context
- .getCounter("FetcherStatus", "robots_defer_visits_dropped")
- .increment(killedURLs);
+ robotsDeferVisitsDroppedCounter.increment(killedURLs);
}
continue;
}
@@ -347,7 +388,7 @@ public class FetcherThread extends Thread {
output(fit.url, fit.datum, null,
ProtocolStatus.STATUS_ROBOTS_DENIED,
CrawlDatum.STATUS_FETCH_GONE);
- context.getCounter("FetcherStatus",
"robots_denied").increment(1);
+ robotsDeniedCounter.increment(1);
continue;
}
if (rules.getCrawlDelay() > 0) {
@@ -359,8 +400,7 @@ public class FetcherThread extends Thread {
output(fit.url, fit.datum, null,
ProtocolStatus.STATUS_ROBOTS_DENIED,
CrawlDatum.STATUS_FETCH_GONE);
- context.getCounter("FetcherStatus",
- "robots_denied_maxcrawldelay").increment(1);
+ robotsDeniedMaxCrawlDelayCounter.increment(1);
continue;
} else {
FetchItemQueue fiq =
fetchQueues.getFetchItemQueue(fit.queueID);
@@ -398,7 +438,8 @@ public class FetcherThread extends Thread {
endEvent.addEventData("status", status.getName());
publisher.publish(endEvent, conf);
}
- context.getCounter("FetcherStatus", status.getName()).increment(1);
+ // Dynamic counter for protocol status - can't cache as status
varies
+ context.getCounter(NutchMetrics.GROUP_FETCHER,
status.getName()).increment(1);
switch (status.getCode()) {
@@ -447,8 +488,7 @@ public class FetcherThread extends Thread {
int killedURLs = fetchQueues
.checkExceptionThreshold(fit.getQueueID());
if (killedURLs != 0)
- context.getCounter("FetcherStatus",
- "AboveExceptionThresholdInQueue").increment(killedURLs);
+ aboveExceptionThresholdCounter.increment(killedURLs);
/* FALLTHROUGH */
case ProtocolStatus.RETRY: // retry
@@ -478,8 +518,7 @@ public class FetcherThread extends Thread {
if (redirecting && redirectCount > maxRedirect) {
fetchQueues.finishFetchItem(fit);
- context.getCounter("FetcherStatus", "redirect_count_exceeded")
- .increment(1);
+ redirectCountExceededCounter.increment(1);
LOG.info("{} {} - redirect count exceeded {} ({})", getName(),
Thread.currentThread().getId(), fit.url,
maxRedirectExceededSkip ? "skipped" : "linked");
@@ -613,13 +652,13 @@ public class FetcherThread extends Thread {
throws ScoringFilterException {
if (fetchQueues.redirectIsQueuedRecently(redirUrl)) {
redirecting = false;
- context.getCounter("FetcherStatus",
"redirect_deduplicated").increment(1);
+ redirectDeduplicatedCounter.increment(1);
LOG.debug(" - ignoring redirect from {} to {} as duplicate", fit.url,
redirUrl);
return null;
} else if (fetchQueues.timelimitExceeded()) {
redirecting = false;
- context.getCounter("FetcherStatus", "hitByTimeLimit").increment(1);
+ hitByTimeLimitCounter.increment(1);
LOG.debug(" - ignoring redirect from {} to {} - timelimit reached",
fit.url, redirUrl);
return null;
@@ -632,7 +671,7 @@ public class FetcherThread extends Thread {
} else {
// stop redirecting
redirecting = false;
- context.getCounter("FetcherStatus",
"FetchItem.notCreated.redirect").increment(1);
+ redirectNotCreatedCounter.increment(1);
}
return fit;
}
@@ -805,8 +844,7 @@ public class FetcherThread extends Thread {
FetchItemQueue queue = fetchQueues.getFetchItemQueue(ft.queueID);
queue.alreadyFetched.add(url.toString().hashCode());
- context.getCounter("FetcherOutlinks",
"outlinks_detected").increment(
- outlinks.size());
+ outlinksDetectedCounter.increment(outlinks.size());
// Counter to limit num outlinks to follow per page
int outlinkCounter = 0;
@@ -838,7 +876,7 @@ public class FetcherThread extends Thread {
new CrawlDatum(CrawlDatum.STATUS_LINKED, interval),
queueMode, outlinkDepth + 1);
- context.getCounter("FetcherOutlinks",
"outlinks_following").increment(1);
+ outlinksFollowingCounter.increment(1);
fetchQueues.addFetchItem(fit);
@@ -864,7 +902,8 @@ public class FetcherThread extends Thread {
if (parseResult != null && !parseResult.isEmpty()) {
Parse p = parseResult.get(content.getUrl());
if (p != null) {
- context.getCounter("ParserStatus", ParseStatus.majorCodes[p
+ // Dynamic counter for parse status - can't cache as status varies
+ context.getCounter(NutchMetrics.GROUP_PARSER, ParseStatus.majorCodes[p
.getData().getStatus().getMajorCode()]).increment(1);
return p.getData().getStatus();
}
diff --git a/src/java/org/apache/nutch/fetcher/QueueFeeder.java
b/src/java/org/apache/nutch/fetcher/QueueFeeder.java
index c48c4b8f3..6ee973dd3 100644
--- a/src/java/org/apache/nutch/fetcher/QueueFeeder.java
+++ b/src/java/org/apache/nutch/fetcher/QueueFeeder.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.fetcher.FetchItemQueues.QueuingStatus;
import org.apache.nutch.fetcher.Fetcher.FetcherRun;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
@@ -94,14 +95,16 @@ public class QueueFeeder extends Thread {
LOG.info("QueueFeeder stopping, timeout reached.");
}
queuingStatus[qstatus]++;
- context.getCounter("FetcherStatus", "hitByTimeout").increment(1);
+ context.getCounter(NutchMetrics.GROUP_FETCHER,
+ NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL).increment(1);
} else {
int qstatus = QueuingStatus.HIT_BY_TIMELIMIT.ordinal();
if (queuingStatus[qstatus] == 0) {
LOG.info("QueueFeeder stopping, timelimit exceeded.");
}
queuingStatus[qstatus]++;
- context.getCounter("FetcherStatus", "hitByTimeLimit").increment(1);
+ context.getCounter(NutchMetrics.GROUP_FETCHER,
+ NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL).increment(1);
}
try {
hasMore = context.nextKeyValue();
@@ -133,7 +136,8 @@ public class QueueFeeder extends Thread {
String u = filterNormalize(url.toString());
if (u == null) {
// filtered or failed to normalize
- context.getCounter("FetcherStatus", "filtered").increment(1);
+ context.getCounter(NutchMetrics.GROUP_FETCHER,
+ NutchMetrics.FETCHER_FILTERED_TOTAL).increment(1);
continue;
}
url = new Text(u);
@@ -150,9 +154,8 @@ public class QueueFeeder extends Thread {
QueuingStatus status = queues.addFetchItem(url, datum);
queuingStatus[status.ordinal()]++;
if (status == QueuingStatus.ABOVE_EXCEPTION_THRESHOLD) {
- context
- .getCounter("FetcherStatus",
"AboveExceptionThresholdInQueue")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_FETCHER,
+
NutchMetrics.FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL).increment(1);
}
cnt++;
feed--;
diff --git a/src/java/org/apache/nutch/hostdb/ResolverThread.java
b/src/java/org/apache/nutch/hostdb/ResolverThread.java
index 2140ea52d..2690a73fa 100644
--- a/src/java/org/apache/nutch/hostdb/ResolverThread.java
+++ b/src/java/org/apache/nutch/hostdb/ResolverThread.java
@@ -24,6 +24,8 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.metrics.NutchMetrics;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -72,16 +74,19 @@ public class ResolverThread implements Runnable {
InetAddress inetAddr = InetAddress.getByName(host);
if (datum.isEmpty()) {
- context.getCounter("UpdateHostDb", "new_known_host").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_NEW_KNOWN_HOST_TOTAL).increment(1);
datum.setLastCheck();
LOG.info("{}: new_known_host {}", host, datum);
} else if (datum.getDnsFailures() > 0) {
- context.getCounter("UpdateHostDb", "rediscovered_host").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_REDISCOVERED_HOST_TOTAL).increment(1);
datum.setLastCheck();
datum.setDnsFailures(0l);
LOG.info("{}: rediscovered_host {}", host, datum);
} else {
- context.getCounter("UpdateHostDb", "existing_known_host").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_EXISTING_KNOWN_HOST_TOTAL).increment(1);
datum.setLastCheck();
LOG.info("{}: existing_known_host {}", host, datum);
}
@@ -95,7 +100,8 @@ public class ResolverThread implements Runnable {
datum.setLastCheck();
datum.setDnsFailures(1l);
context.write(hostText, datum);
- context.getCounter("UpdateHostDb", "new_unknown_host").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_NEW_UNKNOWN_HOST_TOTAL).increment(1);
LOG.info("{}: new_unknown_host {}", host, datum);
} else {
datum.setLastCheck();
@@ -106,15 +112,18 @@ public class ResolverThread implements Runnable {
purgeFailedHostsThreshold < datum.getDnsFailures()) {
context.write(hostText, datum);
- context.getCounter("UpdateHostDb",
"existing_unknown_host").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL).increment(1);
LOG.info("{}: existing_unknown_host {}", host, datum);
} else {
- context.getCounter("UpdateHostDb",
"purged_unknown_host").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_PURGED_UNKNOWN_HOST_TOTAL).increment(1);
LOG.info("{}: purged_unknown_host {}", host, datum);
}
}
- context.getCounter("UpdateHostDb",
createFailureCounterLabel(datum)).increment(1);
+ // Dynamic counter based on failure count - can't cache
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
createFailureCounterLabel(datum)).increment(1);
} catch (Exception ioe) {
LOG.warn(StringUtils.stringifyException(ioe));
}
@@ -122,7 +131,8 @@ public class ResolverThread implements Runnable {
LOG.warn(StringUtils.stringifyException(e));
}
- context.getCounter("UpdateHostDb", "checked_hosts").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_CHECKED_HOSTS_TOTAL).increment(1);
}
private String createFailureCounterLabel(HostDatum datum) {
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
index ca6797ac0..1495f7491 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.protocol.ProtocolStatus;
@@ -136,7 +137,8 @@ public class UpdateHostDbMapper
try {
url = new URL(keyStr);
} catch (MalformedURLException e) {
- context.getCounter("UpdateHostDb", "malformed_url").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_MALFORMED_URL_TOTAL).increment(1);
return;
}
String hostName = URLUtil.getHost(url);
@@ -146,7 +148,8 @@ public class UpdateHostDbMapper
// Filtered out?
if (buffer == null) {
- context.getCounter("UpdateHostDb", "filtered_records").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL).increment(1);
LOG.debug("UpdateHostDb: {} crawldatum has been filtered", hostName);
return;
}
@@ -219,7 +222,8 @@ public class UpdateHostDbMapper
// Filtered out?
if (buffer == null) {
- context.getCounter("UpdateHostDb", "filtered_records").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL).increment(1);
LOG.debug("UpdateHostDb: {} hostdatum has been filtered", keyStr);
return;
}
@@ -243,7 +247,8 @@ public class UpdateHostDbMapper
// Filtered out?
if (buffer == null) {
- context.getCounter("UpdateHostDb", "filtered_records").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL).increment(1);
LOG.debug("UpdateHostDb: {} score has been filtered", keyStr);
return;
}
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
index 1431b5636..039fa5ba1 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
@@ -36,6 +36,7 @@ import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.metrics.NutchMetrics;
import com.tdunning.math.stats.TDigest;
@@ -379,12 +380,14 @@ public class UpdateHostDbReducer
// Impose limits on minimum number of URLs?
if (urlLimit > -1l) {
if (hostDatum.numRecords() < urlLimit) {
- context.getCounter("UpdateHostDb",
"url_limit_not_reached").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL).increment(1);
return;
}
}
- context.getCounter("UpdateHostDb", "total_hosts").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_TOTAL_HOSTS_TOTAL).increment(1);
// See if this record is to be checked
if (shouldCheck(hostDatum)) {
@@ -401,7 +404,8 @@ public class UpdateHostDbReducer
// Do not progress, the datum will be written in the resolver thread
return;
} else if (checkAny) {
- context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ NutchMetrics.HOSTDB_SKIPPED_NOT_ELIGIBLE_TOTAL).increment(1);
LOG.debug("UpdateHostDb: {}: skipped_not_eligible", key);
}
diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java
b/src/java/org/apache/nutch/indexer/CleaningJob.java
index cedee8e34..ae01e4b0d 100644
--- a/src/java/org/apache/nutch/indexer/CleaningJob.java
+++ b/src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -36,6 +36,7 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.slf4j.Logger;
@@ -118,7 +119,8 @@ public class CleaningJob implements Tool {
for (Text document : values) {
writers.delete(document.toString());
totalDeleted++;
- context.getCounter("CleaningJobStatus", "Deleted
documents").increment(1);
+ context.getCounter(NutchMetrics.GROUP_CLEANING,
+ NutchMetrics.CLEANING_DELETED_DOCUMENTS_TOTAL).increment(1);
}
}
}
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 9fb800771..33f2f244a 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -40,6 +40,7 @@ import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
@@ -283,7 +284,8 @@ public class IndexerMapReduce extends Configured {
.indexOf("noindex") != -1) {
// Delete it!
context.write(key, DELETE_ACTION);
- context.getCounter("IndexerStatus", "deleted
(robots=noindex)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INDEXER,
+
NutchMetrics.INDEXER_DELETED_ROBOTS_NOINDEX_TOTAL).increment(1);
return;
}
}
@@ -300,7 +302,8 @@ public class IndexerMapReduce extends Configured {
if (delete && fetchDatum != null) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
|| dbDatum != null && dbDatum.getStatus() ==
CrawlDatum.STATUS_DB_GONE) {
- context.getCounter("IndexerStatus", "deleted (gone)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INDEXER,
+ NutchMetrics.INDEXER_DELETED_GONE_TOTAL).increment(1);
context.write(key, DELETE_ACTION);
return;
}
@@ -309,7 +312,8 @@ public class IndexerMapReduce extends Configured {
|| fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
|| dbDatum != null && dbDatum.getStatus() ==
CrawlDatum.STATUS_DB_REDIR_PERM
|| dbDatum != null && dbDatum.getStatus() ==
CrawlDatum.STATUS_DB_REDIR_TEMP) {
- context.getCounter("IndexerStatus", "deleted
(redirects)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INDEXER,
+ NutchMetrics.INDEXER_DELETED_REDIRECTS_TOTAL).increment(1);
context.write(key, DELETE_ACTION);
return;
}
@@ -321,14 +325,16 @@ public class IndexerMapReduce extends Configured {
// Whether to delete pages marked as duplicates
if (delete && dbDatum != null && dbDatum.getStatus() ==
CrawlDatum.STATUS_DB_DUPLICATE) {
- context.getCounter("IndexerStatus", "deleted
(duplicates)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INDEXER,
+ NutchMetrics.INDEXER_DELETED_DUPLICATES_TOTAL).increment(1);
context.write(key, DELETE_ACTION);
return;
}
// Whether to skip DB_NOTMODIFIED pages
if (skip && dbDatum != null && dbDatum.getStatus() ==
CrawlDatum.STATUS_DB_NOTMODIFIED) {
- context.getCounter("IndexerStatus", "skipped (not
modified)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INDEXER,
+ NutchMetrics.INDEXER_SKIPPED_NOT_MODIFIED_TOTAL).increment(1);
return;
}
@@ -355,7 +361,8 @@ public class IndexerMapReduce extends Configured {
boost = scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse,
inlinks, boost);
} catch (final ScoringFilterException e) {
- context.getCounter("IndexerStatus", "errors
(ScoringFilter)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INDEXER,
+ NutchMetrics.INDEXER_ERRORS_SCORING_FILTER_TOTAL).increment(1);
LOG.warn("Error calculating score {}: {}", key, e);
return;
}
@@ -390,7 +397,8 @@ public class IndexerMapReduce extends Configured {
doc = filters.filter(doc, parse, key, fetchDatum, inlinks);
} catch (final IndexingException e) {
LOG.warn("Error indexing {}: ", key, e);
- context.getCounter("IndexerStatus", "errors
(IndexingFilter)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INDEXER,
+ NutchMetrics.INDEXER_ERRORS_INDEXING_FILTER_TOTAL).increment(1);
return;
}
@@ -400,9 +408,11 @@ public class IndexerMapReduce extends Configured {
if (deleteSkippedByIndexingFilter) {
NutchIndexAction action = new NutchIndexAction(null,
NutchIndexAction.DELETE);
context.write(key, action);
- context.getCounter("IndexerStatus", "deleted
(IndexingFilter)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INDEXER,
+
NutchMetrics.INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL).increment(1);
} else {
- context.getCounter("IndexerStatus", "skipped
(IndexingFilter)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INDEXER,
+
NutchMetrics.INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL).increment(1);
}
return;
}
@@ -422,7 +432,8 @@ public class IndexerMapReduce extends Configured {
doc.add("binaryContent", binary);
}
- context.getCounter("IndexerStatus", "indexed (add/update)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_INDEXER,
+ NutchMetrics.INDEXER_INDEXED_TOTAL).increment(1);
NutchIndexAction action = new NutchIndexAction(doc,
NutchIndexAction.ADD);
context.write(key, action);
diff --git a/src/java/org/apache/nutch/metrics/NutchMetrics.java
b/src/java/org/apache/nutch/metrics/NutchMetrics.java
new file mode 100644
index 000000000..e64a8d6d0
--- /dev/null
+++ b/src/java/org/apache/nutch/metrics/NutchMetrics.java
@@ -0,0 +1,371 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metrics;
+
+/**
+ * Centralized constants for Hadoop metrics counter groups and names.
+ *
+ * <p>Follows <a href="https://prometheus.io/docs/practices/naming/">Prometheus
+ * naming conventions</a>:
+ * <ul>
+ * <li>Counter groups use the {@code nutch_} prefix namespace</li>
+ * <li>Counter names use snake_case</li>
+ * <li>Accumulating counters use {@code _total} suffix</li>
+ * <li>Units are included in counter names where applicable (e.g., {@code
_bytes})</li>
+ * </ul>
+ *
+ * @since 1.22
+ */
+public final class NutchMetrics {
+
+ private NutchMetrics() {
+ // Utility class - prevent instantiation
+ }
+
+ // =========================================================================
+ // Counter Groups (Prometheus namespace style with nutch_ prefix)
+ // =========================================================================
+
+ /** Counter group for fetcher operations. */
+ public static final String GROUP_FETCHER = "nutch_fetcher";
+
+ /** Counter group for fetcher outlink processing. */
+ public static final String GROUP_FETCHER_OUTLINKS = "nutch_fetcher_outlinks";
+
+ /** Counter group for generator operations. */
+ public static final String GROUP_GENERATOR = "nutch_generator";
+
+ /** Counter group for indexer operations. */
+ public static final String GROUP_INDEXER = "nutch_indexer";
+
+ /** Counter group for CrawlDb operations. */
+ public static final String GROUP_CRAWLDB = "nutch_crawldb";
+
+ /** Counter group for CrawlDb filter operations. */
+ public static final String GROUP_CRAWLDB_FILTER = "nutch_crawldb_filter";
+
+ /** Counter group for injector operations. */
+ public static final String GROUP_INJECTOR = "nutch_injector";
+
+ /** Counter group for HostDb operations. */
+ public static final String GROUP_HOSTDB = "nutch_hostdb";
+
+ /** Counter group for parser operations. */
+ public static final String GROUP_PARSER = "nutch_parser";
+
+ /** Counter group for deduplication operations. */
+ public static final String GROUP_DEDUP = "nutch_dedup";
+
+ /** Counter group for cleaning job operations. */
+ public static final String GROUP_CLEANING = "nutch_cleaning";
+
+ /** Counter group for WebGraph operations. */
+ public static final String GROUP_WEBGRAPH = "nutch_webgraph";
+
+ /** Counter group for sitemap processing operations. */
+ public static final String GROUP_SITEMAP = "nutch_sitemap";
+
+ /** Counter group for WARC export operations. */
+ public static final String GROUP_WARC_EXPORTER = "nutch_warc_exporter";
+
+ /** Counter group for domain statistics operations. */
+ public static final String GROUP_DOMAIN_STATS = "nutch_domain_stats";
+
+ // =========================================================================
+ // Fetcher Counters
+ // =========================================================================
+
+ /** Total bytes downloaded by fetcher. */
+ public static final String FETCHER_BYTES_DOWNLOADED_TOTAL =
"bytes_downloaded_total";
+
+ /** URLs denied by robots.txt. */
+ public static final String FETCHER_ROBOTS_DENIED_TOTAL =
"robots_denied_total";
+
+ /** URLs denied due to crawl delay exceeding maximum. */
+ public static final String FETCHER_ROBOTS_DENIED_MAXCRAWLDELAY_TOTAL =
"robots_denied_maxcrawldelay_total";
+
+ /** URLs dropped due to robots.txt deferred visits. */
+ public static final String FETCHER_ROBOTS_DEFER_VISITS_DROPPED_TOTAL =
"robots_defer_visits_dropped_total";
+
+ /** Redirects that exceeded maximum redirect count. */
+ public static final String FETCHER_REDIRECT_COUNT_EXCEEDED_TOTAL =
"redirect_count_exceeded_total";
+
+ /** Redirects deduplicated (already seen). */
+ public static final String FETCHER_REDIRECT_DEDUPLICATED_TOTAL =
"redirect_deduplicated_total";
+
+ /** FetchItems not created for redirects. */
+ public static final String FETCHER_REDIRECT_NOT_CREATED_TOTAL =
"redirect_not_created_total";
+
+ /** URLs hit by time limit. */
+ public static final String FETCHER_HIT_BY_TIMELIMIT_TOTAL =
"hit_by_timelimit_total";
+
+ /** URLs hit by timeout. */
+ public static final String FETCHER_HIT_BY_TIMEOUT_TOTAL =
"hit_by_timeout_total";
+
+ /** URLs hit by throughput threshold. */
+ public static final String FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL =
"hit_by_throughput_threshold_total";
+
+ /** Threads that hung during fetching. */
+ public static final String FETCHER_HUNG_THREADS_TOTAL = "hung_threads_total";
+
+ /** URLs filtered during fetching. */
+ public static final String FETCHER_FILTERED_TOTAL = "filtered_total";
+
+ /** URLs dropped due to exception threshold in queue. */
+ public static final String FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL =
"above_exception_threshold_total";
+
+ // =========================================================================
+ // Fetcher Outlinks Counters
+ // =========================================================================
+
+ /** Outlinks detected during parsing. */
+ public static final String FETCHER_OUTLINKS_DETECTED_TOTAL =
"outlinks_detected_total";
+
+ /** Outlinks being followed. */
+ public static final String FETCHER_OUTLINKS_FOLLOWING_TOTAL =
"outlinks_following_total";
+
+ // =========================================================================
+ // Generator Counters
+ // =========================================================================
+
+ /** URLs rejected by URL filters. */
+ public static final String GENERATOR_URL_FILTERS_REJECTED_TOTAL =
"url_filters_rejected_total";
+
+ /** URL filter exceptions. */
+ public static final String GENERATOR_URL_FILTER_EXCEPTION_TOTAL =
"url_filter_exception_total";
+
+ /** URLs rejected by fetch schedule. */
+ public static final String GENERATOR_SCHEDULE_REJECTED_TOTAL =
"schedule_rejected_total";
+
+ /** URLs waiting for CrawlDb update. */
+ public static final String GENERATOR_WAIT_FOR_UPDATE_TOTAL =
"wait_for_update_total";
+
+ /** URLs rejected by JEXL expression. */
+ public static final String GENERATOR_EXPR_REJECTED_TOTAL =
"expr_rejected_total";
+
+ /** URLs rejected due to status restriction. */
+ public static final String GENERATOR_STATUS_REJECTED_TOTAL =
"status_rejected_total";
+
+ /** URLs rejected due to score below threshold. */
+ public static final String GENERATOR_SCORE_TOO_LOW_TOTAL =
"score_too_low_total";
+
+ /** URLs rejected due to fetch interval exceeding threshold. */
+ public static final String GENERATOR_INTERVAL_REJECTED_TOTAL =
"interval_rejected_total";
+
+ /** Malformed URLs encountered. */
+ public static final String GENERATOR_MALFORMED_URL_TOTAL =
"malformed_url_total";
+
+ /** URLs skipped due to per-host overflow. */
+ public static final String GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL =
"urls_skipped_per_host_overflow_total";
+
+ /** Hosts affected by per-host overflow. */
+ public static final String GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL
= "hosts_affected_per_host_overflow_total";
+
+ // =========================================================================
+ // Indexer Counters
+ // =========================================================================
+
+ /** Documents deleted due to robots noindex. */
+ public static final String INDEXER_DELETED_ROBOTS_NOINDEX_TOTAL =
"deleted_robots_noindex_total";
+
+ /** Documents deleted because they are gone. */
+ public static final String INDEXER_DELETED_GONE_TOTAL = "deleted_gone_total";
+
+ /** Documents deleted due to redirects. */
+ public static final String INDEXER_DELETED_REDIRECTS_TOTAL =
"deleted_redirects_total";
+
+ /** Documents deleted as duplicates. */
+ public static final String INDEXER_DELETED_DUPLICATES_TOTAL =
"deleted_duplicates_total";
+
+ /** Documents deleted by indexing filter. */
+ public static final String INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL =
"deleted_by_indexing_filter_total";
+
+ /** Documents skipped (not modified). */
+ public static final String INDEXER_SKIPPED_NOT_MODIFIED_TOTAL =
"skipped_not_modified_total";
+
+ /** Documents skipped by indexing filter. */
+ public static final String INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL =
"skipped_by_indexing_filter_total";
+
+ /** Scoring filter errors. */
+ public static final String INDEXER_ERRORS_SCORING_FILTER_TOTAL =
"errors_scoring_filter_total";
+
+ /** Indexing filter errors. */
+ public static final String INDEXER_ERRORS_INDEXING_FILTER_TOTAL =
"errors_indexing_filter_total";
+
+ /** Documents indexed (added or updated). */
+ public static final String INDEXER_INDEXED_TOTAL = "indexed_total";
+
+ // =========================================================================
+ // CrawlDb Counters
+ // =========================================================================
+
+ /** URLs filtered during CrawlDb operations. */
+ public static final String CRAWLDB_URLS_FILTERED_TOTAL =
"urls_filtered_total";
+
+ /** Gone (404) records removed during CrawlDb operations. */
+ public static final String CRAWLDB_GONE_RECORDS_REMOVED_TOTAL =
"gone_records_removed_total";
+
+ /** Orphan records removed during CrawlDb operations. */
+ public static final String CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL =
"orphan_records_removed_total";
+
+ // =========================================================================
+ // Injector Counters
+ // =========================================================================
+
+ /** URLs filtered during injection. */
+ public static final String INJECTOR_URLS_FILTERED_TOTAL =
"urls_filtered_total";
+
+ /** URLs injected. */
+ public static final String INJECTOR_URLS_INJECTED_TOTAL =
"urls_injected_total";
+
+ /** Unique URLs injected. */
+ public static final String INJECTOR_URLS_INJECTED_UNIQUE_TOTAL =
"urls_injected_unique_total";
+
+ /** URLs merged with existing CrawlDb entries. */
+ public static final String INJECTOR_URLS_MERGED_TOTAL = "urls_merged_total";
+
+ /** URLs purged due to 404 status. */
+ public static final String INJECTOR_URLS_PURGED_404_TOTAL =
"urls_purged_404_total";
+
+ /** URLs purged by filter. */
+ public static final String INJECTOR_URLS_PURGED_FILTER_TOTAL =
"urls_purged_filter_total";
+
+ // =========================================================================
+ // HostDb Counters
+ // =========================================================================
+
+ /** Malformed URLs in HostDb. */
+ public static final String HOSTDB_MALFORMED_URL_TOTAL =
"malformed_url_total";
+
+ /** Records filtered in HostDb. */
+ public static final String HOSTDB_FILTERED_RECORDS_TOTAL =
"filtered_records_total";
+
+ /** Total hosts processed. */
+ public static final String HOSTDB_TOTAL_HOSTS_TOTAL = "total_hosts_total";
+
+ /** Hosts skipped (not eligible). */
+ public static final String HOSTDB_SKIPPED_NOT_ELIGIBLE_TOTAL =
"skipped_not_eligible_total";
+
+ /** Hosts where URL limit was not reached. */
+ public static final String HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL =
"url_limit_not_reached_total";
+
+ /** New known hosts discovered. */
+ public static final String HOSTDB_NEW_KNOWN_HOST_TOTAL =
"new_known_host_total";
+
+ /** Rediscovered hosts. */
+ public static final String HOSTDB_REDISCOVERED_HOST_TOTAL =
"rediscovered_host_total";
+
+ /** Existing known hosts. */
+ public static final String HOSTDB_EXISTING_KNOWN_HOST_TOTAL =
"existing_known_host_total";
+
+ /** New unknown hosts. */
+ public static final String HOSTDB_NEW_UNKNOWN_HOST_TOTAL =
"new_unknown_host_total";
+
+ /** Existing unknown hosts. */
+ public static final String HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL =
"existing_unknown_host_total";
+
+ /** Purged unknown hosts. */
+ public static final String HOSTDB_PURGED_UNKNOWN_HOST_TOTAL =
"purged_unknown_host_total";
+
+ /** Hosts checked. */
+ public static final String HOSTDB_CHECKED_HOSTS_TOTAL =
"checked_hosts_total";
+
+ // =========================================================================
+ // Deduplication Counters
+ // =========================================================================
+
+ /** Documents marked as duplicate. */
+ public static final String DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL =
"documents_marked_duplicate_total";
+
+ // =========================================================================
+ // Cleaning Job Counters
+ // =========================================================================
+
+ /** Documents deleted during cleaning. */
+ public static final String CLEANING_DELETED_DOCUMENTS_TOTAL =
"deleted_documents_total";
+
+ // =========================================================================
+ // WebGraph Counters
+ // =========================================================================
+
+ /** Links added to WebGraph. */
+ public static final String WEBGRAPH_ADDED_LINKS_TOTAL = "added_links_total";
+
+ /** Links removed from WebGraph. */
+ public static final String WEBGRAPH_REMOVED_LINKS_TOTAL =
"removed_links_total";
+
+ // =========================================================================
+ // Sitemap Counters
+ // =========================================================================
+
+ /** Filtered records in sitemap processing. */
+ public static final String SITEMAP_FILTERED_RECORDS_TOTAL =
"filtered_records_total";
+
+ /** Seeds extracted from sitemaps. */
+ public static final String SITEMAP_SEEDS_TOTAL = "sitemap_seeds_total";
+
+ /** Sitemaps discovered from hostname. */
+ public static final String SITEMAP_FROM_HOSTNAME_TOTAL =
"sitemaps_from_hostname_total";
+
+ /** Sitemaps filtered from hostname. */
+ public static final String SITEMAP_FILTERED_FROM_HOSTNAME_TOTAL =
"filtered_sitemaps_from_hostname_total";
+
+ /** Failed sitemap fetches. */
+ public static final String SITEMAP_FAILED_FETCHES_TOTAL =
"failed_fetches_total";
+
+ /** Existing sitemap entries. */
+ public static final String SITEMAP_EXISTING_ENTRIES_TOTAL =
"existing_sitemap_entries_total";
+
+ /** New sitemap entries. */
+ public static final String SITEMAP_NEW_ENTRIES_TOTAL =
"new_sitemap_entries_total";
+
+ // =========================================================================
+ // WARC Exporter Counters
+ // =========================================================================
+
+ /** Missing content in WARC export. */
+ public static final String WARC_MISSING_CONTENT_TOTAL =
"missing_content_total";
+
+ /** Missing metadata in WARC export. */
+ public static final String WARC_MISSING_METADATA_TOTAL =
"missing_metadata_total";
+
+ /** Omitted empty responses in WARC export. */
+ public static final String WARC_OMITTED_EMPTY_RESPONSE_TOTAL =
"omitted_empty_response_total";
+
+ /** Invalid URIs in WARC export. */
+ public static final String WARC_INVALID_URI_TOTAL = "invalid_uri_total";
+
+ /** WARC records generated. */
+ public static final String WARC_RECORDS_GENERATED_TOTAL =
"records_generated_total";
+
+ /** Exceptions during WARC export. */
+ public static final String WARC_EXCEPTION_TOTAL = "exception_total";
+
+ // =========================================================================
+ // Domain Statistics Counters (enum-based, kept for compatibility)
+ // =========================================================================
+
+ /** Fetched URLs in domain statistics. */
+ public static final String DOMAIN_STATS_FETCHED_TOTAL = "fetched_total";
+
+ /** Not fetched URLs in domain statistics. */
+ public static final String DOMAIN_STATS_NOT_FETCHED_TOTAL =
"not_fetched_total";
+
+ /** Empty results in domain statistics. */
+ public static final String DOMAIN_STATS_EMPTY_RESULT_TOTAL =
"empty_result_total";
+}
+
diff --git a/src/java/org/apache/nutch/metrics/package-info.java
b/src/java/org/apache/nutch/metrics/package-info.java
new file mode 100644
index 000000000..376605d04
--- /dev/null
+++ b/src/java/org/apache/nutch/metrics/package-info.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Metrics infrastructure for Apache Nutch.
+ *
+ * <p>This package provides centralized constants and utilities for Hadoop
+ * MapReduce metrics/counters following
+ * <a href="https://prometheus.io/docs/practices/naming/">Prometheus naming
+ * conventions</a>.
+ *
+ * <p>The main class is {@link org.apache.nutch.metrics.NutchMetrics} which
+ * defines all counter group names and counter names as constants.
+ *
+ * @since 1.22
+ */
+package org.apache.nutch.metrics;
+
diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java
b/src/java/org/apache/nutch/parse/ParseSegment.java
index 6b2fb5cee..5ec74ea9f 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -37,6 +37,7 @@ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
@@ -129,7 +130,8 @@ public class ParseSegment extends NutchTool implements Tool
{
Parse parse = entry.getValue();
ParseStatus parseStatus = parse.getData().getStatus();
- context.getCounter("ParserStatus",
+ // Dynamic counter based on parse status
+ context.getCounter(NutchMetrics.GROUP_PARSER,
ParseStatus.majorCodes[parseStatus.getMajorCode()]).increment(1);
if (!parseStatus.isSuccess()) {
diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
index 4daefcd8f..0b728a588 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
@@ -58,6 +58,7 @@ import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
@@ -361,14 +362,16 @@ public class WebGraph extends Configured implements Tool {
mostRecent = timestamp;
}
outlinkList.add(WritableUtils.clone(next, conf));
- context.getCounter("WebGraph.outlinks", "added
links").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WEBGRAPH,
+ NutchMetrics.WEBGRAPH_ADDED_LINKS_TOTAL).increment(1);
} else if (value instanceof BooleanWritable) {
BooleanWritable delete = (BooleanWritable) value;
// Actually, delete is always true, otherwise we don't emit it in
the
// mapper in the first place
if (delete.get() == true) {
// This page is gone, do not emit it's outlinks
- context.getCounter("WebGraph.outlinks", "removed
links").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WEBGRAPH,
+ NutchMetrics.WEBGRAPH_REMOVED_LINKS_TOTAL).increment(1);
return;
}
}
diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
index bf824f9b3..df4f6af05 100644
--- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
+++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
@@ -57,6 +57,7 @@ import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.tools.WARCUtils;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -147,13 +148,15 @@ public class WARCExporter extends Configured implements
Tool {
// check that we have everything we need
if (content == null) {
LOG.info("Missing content for {}", key);
- context.getCounter("WARCExporter", "missing content").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_MISSING_CONTENT_TOTAL).increment(1);
return;
}
if (cd == null) {
LOG.info("Missing fetch datum for {}", key);
- context.getCounter("WARCExporter", "missing metadata").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_MISSING_METADATA_TOTAL).increment(1);
return;
}
@@ -161,8 +164,8 @@ public class WARCExporter extends Configured implements
Tool {
// Empty responses is everything that was not a regular response
if (!(cd.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS
|| cd.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED)) {
- context.getCounter("WARCExporter", "omitted empty response")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_OMITTED_EMPTY_RESPONSE_TOTAL).increment(1);
return;
}
}
@@ -237,7 +240,8 @@ public class WARCExporter extends Configured implements
Tool {
.append(uri.toASCIIString()).append(CRLF);
} catch (Exception e) {
LOG.error("Invalid URI {} ", key);
- context.getCounter("WARCExporter", "invalid URI").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_INVALID_URI_TOTAL).increment(1);
return;
}
@@ -269,12 +273,14 @@ public class WARCExporter extends Configured implements
Tool {
new ByteArrayInputStream(bos.toByteArray()));
WARCRecord record = new WARCRecord(in);
context.write(NullWritable.get(), new WARCWritable(record));
- context.getCounter("WARCExporter", "records generated").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_RECORDS_GENERATED_TOTAL).increment(1);
} catch (IOException | IllegalStateException exception) {
LOG.error(
"Exception when generating WARC resource record for {} : {}",
key,
exception.getMessage());
- context.getCounter("WARCExporter", "exception").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_EXCEPTION_TOTAL).increment(1);
}
// Do we need to emit a metadata record too?
@@ -316,7 +322,8 @@ public class WARCExporter extends Configured implements
Tool {
.append(uri.toASCIIString()).append(CRLF);
} catch (Exception e) {
LOG.error("Invalid URI {} ", key);
- context.getCounter("WARCExporter", "invalid URI").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_INVALID_URI_TOTAL).increment(1);
return;
}
@@ -332,13 +339,14 @@ public class WARCExporter extends Configured implements Tool {
new ByteArrayInputStream(bos.toByteArray()));
WARCRecord record = new WARCRecord(in);
context.write(NullWritable.get(), new WARCWritable(record));
- context.getCounter("WARCExporter", "records generated")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_RECORDS_GENERATED_TOTAL).increment(1);
} catch (IOException | IllegalStateException exception) {
LOG.error(
"Exception when generating WARC metadata record for {} : {}",
key, exception.getMessage(), exception);
- context.getCounter("WARCExporter", "exception").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_EXCEPTION_TOTAL).increment(1);
}
}
@@ -376,7 +384,8 @@ public class WARCExporter extends Configured implements Tool {
.append(uri.toASCIIString()).append(CRLF);
} catch (Exception e) {
LOG.error("Invalid URI {} ", key);
- context.getCounter("WARCExporter", "invalid URI").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_INVALID_URI_TOTAL).increment(1);
return;
}
@@ -392,13 +401,14 @@ public class WARCExporter extends Configured implements Tool {
new ByteArrayInputStream(bos.toByteArray()));
WARCRecord record = new WARCRecord(in);
context.write(NullWritable.get(), new WARCWritable(record));
- context.getCounter("WARCExporter", "records generated")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_RECORDS_GENERATED_TOTAL).increment(1);
} catch (IOException | IllegalStateException exception) {
LOG.error(
"Exception when generating WARC metadata record for {} : {}",
key, exception.getMessage(), exception);
- context.getCounter("WARCExporter", "exception").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER,
+ NutchMetrics.WARC_EXCEPTION_TOTAL).increment(1);
}
}
}
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index d83a6e358..7055a6d86 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -45,6 +45,7 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.hostdb.HostDatum;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.protocol.Content;
@@ -161,11 +162,13 @@ public class SitemapProcessor extends Configured implements Tool {
url.startsWith("file:/")) {
        // For entry from sitemap urls file, fetch the sitemap, extract urls and emit those
if((url = filterNormalize(url)) == null) {
- context.getCounter("Sitemap", "filtered_records").increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP,
+ NutchMetrics.SITEMAP_FILTERED_RECORDS_TOTAL).increment(1);
return;
}
- context.getCounter("Sitemap", "sitemap_seeds").increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP,
+ NutchMetrics.SITEMAP_SEEDS_TOTAL).increment(1);
generateSitemapUrlDatum(protocolFactory.getProtocol(url), url,
context);
} else {
LOG.info("generateSitemapsFromHostname: {}", key.toString());
@@ -203,7 +206,8 @@ public class SitemapProcessor extends Configured implements Tool {
(url = filterNormalize("https://" + host + "/")) == null &&
(url = filterNormalize("ftp://" + host + "/")) == null &&
(url = filterNormalize("file:/" + host + "/")) == null) {
- context.getCounter("Sitemap", "filtered_records").increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP,
+ NutchMetrics.SITEMAP_FILTERED_RECORDS_TOTAL).increment(1);
return;
}
      // We may wish to use the robots.txt content as the third parameter for .getRobotRules
@@ -214,11 +218,12 @@ public class SitemapProcessor extends Configured implements Tool {
sitemaps.add(url + "sitemap.xml");
}
for (String sitemap : sitemaps) {
- context.getCounter("Sitemap", "sitemaps_from_hostname").increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP,
+ NutchMetrics.SITEMAP_FROM_HOSTNAME_TOTAL).increment(1);
sitemap = filterNormalize(sitemap);
if (sitemap == null) {
- context.getCounter("Sitemap", "filtered_sitemaps_from_hostname")
- .increment(1);
+          context.getCounter(NutchMetrics.GROUP_SITEMAP,
+              NutchMetrics.SITEMAP_FILTERED_FROM_HOSTNAME_TOTAL).increment(1);
} else {
generateSitemapUrlDatum(protocolFactory.getProtocol(sitemap),
sitemap, context);
@@ -254,7 +259,8 @@ public class SitemapProcessor extends Configured implements Tool {
if(status.getCode() != ProtocolStatus.SUCCESS) {
      // If there were any problems fetching the sitemap, log the error and let it go. Not sure how often
      // sitemaps are redirected. In future we might have to handle redirects.
- context.getCounter("Sitemap", "failed_fetches").increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP,
+ NutchMetrics.SITEMAP_FAILED_FETCHES_TOTAL).increment(1);
LOG.error("Error while fetching the sitemap. Status code: {} for {}",
status.getCode(), url);
return;
}
@@ -373,12 +379,14 @@ public class SitemapProcessor extends Configured implements Tool {
originalDatum.setModifiedTime(sitemapDatum.getModifiedTime());
}
- context.getCounter("Sitemap", "existing_sitemap_entries").increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP,
+ NutchMetrics.SITEMAP_EXISTING_ENTRIES_TOTAL).increment(1);
context.write(key, originalDatum);
}
else if(sitemapDatum != null) {
      // For the newly discovered links via sitemap, set the status as unfetched and emit
- context.getCounter("Sitemap", "new_sitemap_entries").increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP,
+ NutchMetrics.SITEMAP_NEW_ENTRIES_TOTAL).increment(1);
sitemapDatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
context.write(key, sitemapDatum);
}