This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new f7c7e1a03 NUTCH-3150 Expand Caching Hadoop Counter References (#892)
f7c7e1a03 is described below
commit f7c7e1a03cf46cdbbedfe923c0d7a97c34fbf2aa
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Tue Feb 10 11:09:16 2026 -0800
NUTCH-3150 Expand Caching Hadoop Counter References (#892)
---
src/java/org/apache/nutch/crawl/CrawlDbFilter.java | 30 ++++++--
.../org/apache/nutch/crawl/CrawlDbReducer.java | 21 ++++--
.../org/apache/nutch/crawl/DeduplicationJob.java | 17 ++++-
src/java/org/apache/nutch/crawl/Generator.java | 80 +++++++++++++++++-----
src/java/org/apache/nutch/crawl/Injector.java | 58 ++++++++++++----
src/java/org/apache/nutch/fetcher/Fetcher.java | 41 ++++++++---
.../org/apache/nutch/hostdb/ResolverThread.java | 69 +++++++++++++------
.../apache/nutch/hostdb/UpdateHostDbMapper.java | 11 ++-
.../apache/nutch/hostdb/UpdateHostDbReducer.java | 7 ++
src/java/org/apache/nutch/indexer/CleaningJob.java | 18 ++++-
.../apache/nutch/scoring/webgraph/WebGraph.java | 23 +++++--
.../org/apache/nutch/tools/warc/WARCExporter.java | 11 ++-
.../org/apache/nutch/util/DomainStatistics.java | 31 +++++++--
.../org/apache/nutch/util/SitemapProcessor.java | 18 ++++-
14 files changed, 343 insertions(+), 92 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
index 7f28a3a85..912c6e4ab 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
@@ -22,6 +22,7 @@ import java.lang.invoke.MethodHandles;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metrics.NutchMetrics;
@@ -50,6 +51,11 @@ public class CrawlDbFilter extends
private String scope;
+ // Cached counter references for performance
+ private Counter goneRecordsRemovedCounter;
+ private Counter orphanRecordsRemovedCounter;
+ private Counter urlsFilteredCounter;
+
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
@@ -68,6 +74,21 @@ public class CrawlDbFilter extends
scope = conf.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
normalizers = new URLNormalizers(conf, scope);
}
+
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot
paths.
+ */
+ private void initCounters(Context context) {
+ goneRecordsRemovedCounter = context.getCounter(
+ NutchMetrics.GROUP_CRAWLDB_FILTER,
NutchMetrics.CRAWLDB_GONE_RECORDS_REMOVED_TOTAL);
+ orphanRecordsRemovedCounter = context.getCounter(
+ NutchMetrics.GROUP_CRAWLDB_FILTER,
NutchMetrics.CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL);
+ urlsFilteredCounter = context.getCounter(
+ NutchMetrics.GROUP_CRAWLDB_FILTER,
NutchMetrics.CRAWLDB_URLS_FILTERED_TOTAL);
}
private Text newKey = new Text();
@@ -81,15 +102,13 @@ public class CrawlDbFilter extends
// https://issues.apache.org/jira/browse/NUTCH-1101 check status first,
// cheaper than normalizing or filtering
if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) {
- context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER,
- NutchMetrics.CRAWLDB_GONE_RECORDS_REMOVED_TOTAL).increment(1);
+ goneRecordsRemovedCounter.increment(1);
return;
}
// Whether to remove orphaned pages
// https://issues.apache.org/jira/browse/NUTCH-1932
if (purgeOrphans && CrawlDatum.STATUS_DB_ORPHAN == value.getStatus()) {
- context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER,
- NutchMetrics.CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL).increment(1);
+ orphanRecordsRemovedCounter.increment(1);
return;
}
if (url != null && urlNormalizers) {
@@ -109,8 +128,7 @@ public class CrawlDbFilter extends
}
}
if (url == null) {
- context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER,
- NutchMetrics.CRAWLDB_URLS_FILTERED_TOTAL).increment(1);
+ urlsFilteredCounter.increment(1);
} else {
// URL has passed filters
newKey.set(url); // collect it
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
index 3ba173447..345411657 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
@@ -18,13 +18,16 @@ package org.apache.nutch.crawl;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.Map.Entry;
import java.io.IOException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
@@ -52,6 +55,9 @@ public class CrawlDbReducer extends
private FetchSchedule schedule;
private ErrorTracker errorTracker;
+ // Cached counter references for status-based metrics
+ private Map<Byte, Counter> statusCounters = new HashMap<>();
+
@Override
public void setup(Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context
context) {
Configuration conf = context.getConfiguration();
@@ -66,6 +72,15 @@ public class CrawlDbReducer extends
errorTracker = new ErrorTracker(NutchMetrics.GROUP_CRAWLDB, context);
}
+ /**
+ * Get counter for status, caching for subsequent lookups.
+ */
+ private Counter getStatusCounter(byte status, Context context) {
+ return statusCounters.computeIfAbsent(status,
+ s -> context.getCounter(NutchMetrics.GROUP_CRAWLDB,
+ CrawlDatum.getStatusName(s)));
+ }
+
@Override
public void reduce(Text key, Iterable<CrawlDatum> values,
Context context) throws IOException, InterruptedException {
@@ -170,8 +185,7 @@ public class CrawlDbReducer extends
}
context.write(key, old);
// Dynamic counter based on status name
- context.getCounter(NutchMetrics.GROUP_CRAWLDB,
- CrawlDatum.getStatusName(old.getStatus())).increment(1);
+ getStatusCounter(old.getStatus(), context).increment(1);
} else {
LOG.warn("Missing fetch and old value, signature={}",
StringUtil.toHexString(signature));
@@ -329,8 +343,7 @@ public class CrawlDbReducer extends
result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
context.write(key, result);
// Dynamic counter based on status name
- context.getCounter(NutchMetrics.GROUP_CRAWLDB,
- CrawlDatum.getStatusName(result.getStatus())).increment(1);
+ getStatusCounter(result.getStatus(), context).increment(1);
}
}
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index d5f983a27..50aa4cd7b 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -128,11 +128,25 @@ public class DeduplicationJob extends NutchTool
implements Tool {
protected String[] compareOrder;
+ // Cached counter reference for performance
+ private Counter documentsMarkedDuplicateCounter;
+
@Override
public void setup(
Reducer<K, CrawlDatum, Text, CrawlDatum>.Context context) {
Configuration conf = context.getConfiguration();
compareOrder = conf.get(DEDUPLICATION_COMPARE_ORDER).split(",");
+
+ // Initialize cached counter reference
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot
paths.
+ */
+ private void initCounters(Context context) {
+ documentsMarkedDuplicateCounter = context.getCounter(
+ NutchMetrics.GROUP_DEDUP,
NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL);
}
protected void writeOutAsDuplicate(CrawlDatum datum,
@@ -140,8 +154,7 @@ public class DeduplicationJob extends NutchTool implements
Tool {
throws IOException, InterruptedException {
datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE);
Text key = (Text) datum.getMetaData().remove(urlKey);
- context.getCounter(NutchMetrics.GROUP_DEDUP,
- NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL).increment(1);
+ documentsMarkedDuplicateCounter.increment(1);
context.write(key, datum);
}
diff --git a/src/java/org/apache/nutch/crawl/Generator.java
b/src/java/org/apache/nutch/crawl/Generator.java
index 456ba689a..57bf7f476 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -194,6 +194,17 @@ public class Generator extends NutchTool implements Tool {
private JexlScript expr = null;
private ErrorTracker errorTracker;
+ // Cached counter references for performance
+ private Counter urlFiltersRejectedCounter;
+ private Counter scheduleRejectedCounter;
+ private Counter waitForUpdateCounter;
+ private Counter exprRejectedCounter;
+ private Counter statusRejectedCounter;
+ private Counter scoreTooLowCounter;
+ private Counter intervalRejectedCounter;
+ private Counter hostsAffectedPerHostOverflowCounter;
+ private Counter urlsSkippedPerHostOverflowCounter;
+
@Override
public void setup(
Mapper<Text, CrawlDatum, FloatWritable, SelectorEntry>.Context context)
@@ -219,6 +230,32 @@ public class Generator extends NutchTool implements Tool {
expr = JexlUtil.parseExpression(conf.get(GENERATOR_EXPR, null));
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot
paths.
+ */
+ private void initCounters(Context context) {
+ urlFiltersRejectedCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR,
NutchMetrics.GENERATOR_URL_FILTERS_REJECTED_TOTAL);
+ scheduleRejectedCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR,
NutchMetrics.GENERATOR_SCHEDULE_REJECTED_TOTAL);
+ waitForUpdateCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR,
NutchMetrics.GENERATOR_WAIT_FOR_UPDATE_TOTAL);
+ exprRejectedCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR,
NutchMetrics.GENERATOR_EXPR_REJECTED_TOTAL);
+ statusRejectedCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR,
NutchMetrics.GENERATOR_STATUS_REJECTED_TOTAL);
+ scoreTooLowCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR,
NutchMetrics.GENERATOR_SCORE_TOO_LOW_TOTAL);
+ intervalRejectedCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR,
NutchMetrics.GENERATOR_INTERVAL_REJECTED_TOTAL);
+ hostsAffectedPerHostOverflowCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR,
NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL);
+ urlsSkippedPerHostOverflowCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR,
NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL);
}
@Override
@@ -230,8 +267,7 @@ public class Generator extends NutchTool implements Tool {
// URLFilters
try {
if (filters.filter(url.toString()) == null) {
- context.getCounter(NutchMetrics.GROUP_GENERATOR,
-
NutchMetrics.GENERATOR_URL_FILTERS_REJECTED_TOTAL).increment(1);
+ urlFiltersRejectedCounter.increment(1);
return;
}
} catch (URLFilterException e) {
@@ -245,8 +281,7 @@ public class Generator extends NutchTool implements Tool {
if (!schedule.shouldFetch(url, crawlDatum, curTime)) {
LOG.debug("-shouldFetch rejected '{}', fetchTime={}, curTime={}", url,
crawlDatum.getFetchTime(), curTime);
- context.getCounter(NutchMetrics.GROUP_GENERATOR,
- NutchMetrics.GENERATOR_SCHEDULE_REJECTED_TOTAL).increment(1);
+ scheduleRejectedCounter.increment(1);
return;
}
@@ -255,8 +290,7 @@ public class Generator extends NutchTool implements Tool {
if (oldGenTime != null) { // awaiting fetch & update
if (oldGenTime.get() + genDelay > curTime) { // still wait for
// update
- context.getCounter(NutchMetrics.GROUP_GENERATOR,
- NutchMetrics.GENERATOR_WAIT_FOR_UPDATE_TOTAL).increment(1);
+ waitForUpdateCounter.increment(1);
return;
}
}
@@ -271,22 +305,19 @@ public class Generator extends NutchTool implements Tool {
// check expr
if (expr != null) {
if (!crawlDatum.execute(expr, key.toString())) {
- context.getCounter(NutchMetrics.GROUP_GENERATOR,
- NutchMetrics.GENERATOR_EXPR_REJECTED_TOTAL).increment(1);
+ exprRejectedCounter.increment(1);
return;
}
}
if (restrictStatus != -1 && restrictStatus != crawlDatum.getStatus()) {
- context.getCounter(NutchMetrics.GROUP_GENERATOR,
- NutchMetrics.GENERATOR_STATUS_REJECTED_TOTAL).increment(1);
+ statusRejectedCounter.increment(1);
return;
}
// consider only entries with a score superior to the threshold
if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold) {
- context.getCounter(NutchMetrics.GROUP_GENERATOR,
- NutchMetrics.GENERATOR_SCORE_TOO_LOW_TOTAL).increment(1);
+ scoreTooLowCounter.increment(1);
return;
}
@@ -294,8 +325,7 @@ public class Generator extends NutchTool implements Tool {
// threshold
if (intervalThreshold != -1
&& crawlDatum.getFetchInterval() > intervalThreshold) {
- context.getCounter(NutchMetrics.GROUP_GENERATOR,
- NutchMetrics.GENERATOR_INTERVAL_REJECTED_TOTAL).increment(1);
+ intervalRejectedCounter.increment(1);
return;
}
@@ -332,6 +362,10 @@ public class Generator extends NutchTool implements Tool {
private Map<String, HostDatum> hostDatumCache = new HashMap<>();
private ErrorTracker errorTracker;
+ // Cached counter references for performance
+ private Counter hostsAffectedPerHostOverflowCounter;
+ private Counter urlsSkippedPerHostOverflowCounter;
+
public void readHostDb() throws IOException {
if (conf.get(GENERATOR_HOSTDB) == null) {
return;
@@ -426,10 +460,22 @@ public class Generator extends NutchTool implements Tool {
}
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
+ // Initialize cached counter references
+ initReducerCounters(context);
readHostDb();
}
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot
paths.
+ */
+ private void initReducerCounters(Context context) {
+ hostsAffectedPerHostOverflowCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR,
NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL);
+ urlsSkippedPerHostOverflowCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR,
NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL);
+ }
+
@Override
public void cleanup(Context context)
throws IOException, InterruptedException {
@@ -555,15 +601,13 @@ public class Generator extends NutchTool implements Tool {
hostCount[1] = 1;
} else {
if (hostCount[1] == (maxCount+1)) {
- context.getCounter(NutchMetrics.GROUP_GENERATOR,
-
NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL).increment(1);
+ hostsAffectedPerHostOverflowCounter.increment(1);
LOG.info(
"Host or domain {} has more than {} URLs for all {}
segments. Additional URLs won't be included in the fetchlist.",
hostordomain, maxCount, maxNumSegments);
}
// skip this entry
- context.getCounter(NutchMetrics.GROUP_GENERATOR,
-
NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL).increment(1);
+ urlsSkippedPerHostOverflowCounter.increment(1);
continue;
}
}
diff --git a/src/java/org/apache/nutch/crawl/Injector.java
b/src/java/org/apache/nutch/crawl/Injector.java
index f84366c2c..3fe4ce9ce 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -24,6 +24,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -130,6 +131,12 @@ public class Injector extends NutchTool implements Tool {
private boolean filterNormalizeAll = false;
private ErrorTracker errorTracker;
+ // Cached counter references for performance
+ private Counter urlsFilteredCounter;
+ private Counter urlsInjectedCounter;
+ private Counter urlsPurged404Counter;
+ private Counter urlsPurgedFilterCounter;
+
@Override
public void setup(Context context) {
Configuration conf = context.getConfiguration();
@@ -151,6 +158,22 @@ public class Injector extends NutchTool implements Tool {
url404Purging = conf.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_INJECTOR, context);
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot
paths.
+ */
+ private void initCounters(Context context) {
+ urlsFilteredCounter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR,
NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL);
+ urlsInjectedCounter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR,
NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL);
+ urlsPurged404Counter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR,
NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL);
+ urlsPurgedFilterCounter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR,
NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL);
}
/* Filter and normalize the input url */
@@ -223,8 +246,7 @@ public class Injector extends NutchTool implements Tool {
url = filterNormalize(url);
if (url == null) {
- context.getCounter(NutchMetrics.GROUP_INJECTOR,
- NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL).increment(1);
+ urlsFilteredCounter.increment(1);
} else {
CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_INJECTED);
@@ -245,8 +267,7 @@ public class Injector extends NutchTool implements Tool {
url, e.getMessage());
errorTracker.incrementCounters(e);
}
- context.getCounter(NutchMetrics.GROUP_INJECTOR,
- NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL).increment(1);
+ urlsInjectedCounter.increment(1);
context.write(key, datum);
}
} else if (value instanceof CrawlDatum) {
@@ -256,16 +277,14 @@ public class Injector extends NutchTool implements Tool {
// remove 404 urls
if (url404Purging && CrawlDatum.STATUS_DB_GONE == datum.getStatus()) {
- context.getCounter(NutchMetrics.GROUP_INJECTOR,
- NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL).increment(1);
+ urlsPurged404Counter.increment(1);
return;
}
if (filterNormalizeAll) {
String url = filterNormalize(key.toString());
if (url == null) {
- context.getCounter(NutchMetrics.GROUP_INJECTOR,
- NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL).increment(1);
+ urlsPurgedFilterCounter.increment(1);
} else {
key.set(url);
context.write(key, datum);
@@ -287,6 +306,10 @@ public class Injector extends NutchTool implements Tool {
private CrawlDatum old = new CrawlDatum();
private CrawlDatum injected = new CrawlDatum();
+ // Cached counter references for performance
+ private Counter urlsInjectedUniqueCounter;
+ private Counter urlsMergedCounter;
+
@Override
public void setup(Context context) {
Configuration conf = context.getConfiguration();
@@ -296,6 +319,19 @@ public class Injector extends NutchTool implements Tool {
update = conf.getBoolean("db.injector.update", false);
LOG.info("Injector: overwrite: {}", overwrite);
LOG.info("Injector: update: {}", update);
+
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot
paths.
+ */
+ private void initCounters(Context context) {
+ urlsInjectedUniqueCounter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR,
NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL);
+ urlsMergedCounter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR,
NutchMetrics.INJECTOR_URLS_MERGED_TOTAL);
}
/**
@@ -351,11 +387,9 @@ public class Injector extends NutchTool implements Tool {
}
}
if (injectedSet) {
- context.getCounter(NutchMetrics.GROUP_INJECTOR,
- NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL).increment(1);
+ urlsInjectedUniqueCounter.increment(1);
if (oldSet) {
- context.getCounter(NutchMetrics.GROUP_INJECTOR,
- NutchMetrics.INJECTOR_URLS_MERGED_TOTAL).increment(1);
+ urlsMergedCounter.increment(1);
}
}
context.write(key, result);
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java
b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 01144f493..029d95ff7 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
@@ -155,6 +156,13 @@ public class Fetcher extends NutchTool implements Tool {
private boolean storingContent;
private boolean parsing;
+ // Cached counter references for performance
+ private Counter bytesDownloadedCounter;
+ private Counter hitByThroughputThresholdCounter;
+ private Counter hitByTimelimitCounter;
+ private Counter hungThreadsCounter;
+ private Counter hitByTimeoutCounter;
+
private AtomicInteger getActiveThreads() {
return activeThreads;
}
@@ -193,11 +201,28 @@ public class Fetcher extends NutchTool implements Tool {
parsing = isParsing(conf);
}
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot
paths.
+ */
+ private void initCounters(Context context) {
+ bytesDownloadedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER,
NutchMetrics.FETCHER_BYTES_DOWNLOADED_TOTAL);
+ hitByThroughputThresholdCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER,
NutchMetrics.FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL);
+ hitByTimelimitCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER,
NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL);
+ hungThreadsCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HUNG_THREADS_TOTAL);
+ hitByTimeoutCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER,
NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL);
+ }
+
@Override
public void run(Context innerContext)
throws IOException, InterruptedException {
setup(innerContext);
+ initCounters(innerContext);
try {
Configuration conf = innerContext.getConfiguration();
LinkedList<FetcherThread> fetcherThreads = new LinkedList<>();
@@ -292,8 +317,7 @@ public class Fetcher extends NutchTool implements Tool {
pagesLastSec = pages.get() - pagesLastSec;
bytesLastSec = (int) bytes.get() - bytesLastSec;
- innerContext.getCounter(NutchMetrics.GROUP_FETCHER,
-
NutchMetrics.FETCHER_BYTES_DOWNLOADED_TOTAL).increment(bytesLastSec);
+ bytesDownloadedCounter.increment(bytesLastSec);
reportStatus(innerContext, fetchQueues, pagesLastSec, bytesLastSec);
@@ -331,9 +355,7 @@ public class Fetcher extends NutchTool implements Tool {
int hitByThrougputThreshold = fetchQueues.emptyQueues();
if (hitByThrougputThreshold != 0)
- innerContext.getCounter(NutchMetrics.GROUP_FETCHER,
- NutchMetrics.FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL)
- .increment(hitByThrougputThreshold);
+
hitByThroughputThresholdCounter.increment(hitByThrougputThreshold);
}
}
}
@@ -414,8 +436,7 @@ public class Fetcher extends NutchTool implements Tool {
if (!feeder.isAlive()) {
int hitByTimeLimit = fetchQueues.checkTimelimit();
if (hitByTimeLimit != 0)
- innerContext.getCounter(NutchMetrics.GROUP_FETCHER,
-
NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL).increment(hitByTimeLimit);
+ hitByTimelimitCounter.increment(hitByTimeLimit);
}
/*
@@ -431,8 +452,7 @@ public class Fetcher extends NutchTool implements Tool {
timeout);
LOG.warn("Aborting with {} hung threads{}.", activeThreads,
feeder.isAlive() ? " (queue feeder still alive)" : "");
- innerContext.getCounter(NutchMetrics.GROUP_FETCHER,
-
NutchMetrics.FETCHER_HUNG_THREADS_TOTAL).increment(activeThreads.get());
+ hungThreadsCounter.increment(activeThreads.get());
for (int i = 0; i < fetcherThreads.size(); i++) {
FetcherThread thread = fetcherThreads.get(i);
if (thread.isAlive()) {
@@ -467,8 +487,7 @@ public class Fetcher extends NutchTool implements Tool {
fetchQueues.getTotalSize(), fetchQueues.getQueueCount(),
feeder.isAlive() ? " (queue feeder still alive)" : "");
int hitByTimeout = fetchQueues.emptyQueues();
- innerContext.getCounter(NutchMetrics.GROUP_FETCHER,
-
NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL).increment(hitByTimeout);
+ hitByTimeoutCounter.increment(hitByTimeout);
return;
}
diff --git a/src/java/org/apache/nutch/hostdb/ResolverThread.java
b/src/java/org/apache/nutch/hostdb/ResolverThread.java
index 4c42c02b4..05e4a940c 100644
--- a/src/java/org/apache/nutch/hostdb/ResolverThread.java
+++ b/src/java/org/apache/nutch/hostdb/ResolverThread.java
@@ -21,6 +21,7 @@ import java.net.InetAddress;
import java.net.UnknownHostException;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.util.StringUtils;
@@ -44,6 +45,17 @@ public class ResolverThread implements Runnable {
protected Context context;
protected int purgeFailedHostsThreshold;
+ // Cached counter references for performance
+ private Counter newKnownHostCounter;
+ private Counter rediscoveredHostCounter;
+ private Counter existingKnownHostCounter;
+ private Counter newUnknownHostCounter;
+ private Counter existingUnknownHostCounter;
+ private Counter purgedUnknownHostCounter;
+ private Counter checkedHostsCounter;
+ private Counter errorsCounter;
+ private Counter errorsNetworkCounter;
+
/**
* Overloaded constructor.
* @param host name of the host to lookup
@@ -61,6 +73,33 @@ public class ResolverThread implements Runnable {
this.datum = datum;
this.context = context;
this.purgeFailedHostsThreshold = purgeFailedHostsThreshold;
+
+ // Initialize cached counters for performance
+ initCounters();
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups.
+ */
+ private void initCounters() {
+ newKnownHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_NEW_KNOWN_HOST_TOTAL);
+ rediscoveredHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB,
NutchMetrics.HOSTDB_REDISCOVERED_HOST_TOTAL);
+ existingKnownHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB,
NutchMetrics.HOSTDB_EXISTING_KNOWN_HOST_TOTAL);
+ newUnknownHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_NEW_UNKNOWN_HOST_TOTAL);
+ existingUnknownHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB,
NutchMetrics.HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL);
+ purgedUnknownHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB,
NutchMetrics.HOSTDB_PURGED_UNKNOWN_HOST_TOTAL);
+ checkedHostsCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_CHECKED_HOSTS_TOTAL);
+ errorsCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.ERROR_TOTAL);
+ errorsNetworkCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.ERROR_NETWORK_TOTAL);
}
/**
@@ -75,19 +114,16 @@ public class ResolverThread implements Runnable {
InetAddress inetAddr = InetAddress.getByName(host);
if (datum.isEmpty()) {
- context.getCounter(NutchMetrics.GROUP_HOSTDB,
- NutchMetrics.HOSTDB_NEW_KNOWN_HOST_TOTAL).increment(1);
+ newKnownHostCounter.increment(1);
datum.setLastCheck();
LOG.info("{}: new_known_host {}", host, datum);
} else if (datum.getDnsFailures() > 0) {
- context.getCounter(NutchMetrics.GROUP_HOSTDB,
- NutchMetrics.HOSTDB_REDISCOVERED_HOST_TOTAL).increment(1);
+ rediscoveredHostCounter.increment(1);
datum.setLastCheck();
datum.setDnsFailures(0l);
LOG.info("{}: rediscovered_host {}", host, datum);
} else {
- context.getCounter(NutchMetrics.GROUP_HOSTDB,
- NutchMetrics.HOSTDB_EXISTING_KNOWN_HOST_TOTAL).increment(1);
+ existingKnownHostCounter.increment(1);
datum.setLastCheck();
LOG.info("{}: existing_known_host {}", host, datum);
}
@@ -101,8 +137,7 @@ public class ResolverThread implements Runnable {
datum.setLastCheck();
datum.setDnsFailures(1l);
context.write(hostText, datum);
- context.getCounter(NutchMetrics.GROUP_HOSTDB,
- NutchMetrics.HOSTDB_NEW_UNKNOWN_HOST_TOTAL).increment(1);
+ newUnknownHostCounter.increment(1);
LOG.info("{}: new_unknown_host {}", host, datum);
} else {
datum.setLastCheck();
@@ -113,12 +148,10 @@ public class ResolverThread implements Runnable {
purgeFailedHostsThreshold < datum.getDnsFailures()) {
context.write(hostText, datum);
- context.getCounter(NutchMetrics.GROUP_HOSTDB,
- NutchMetrics.HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL).increment(1);
+ existingUnknownHostCounter.increment(1);
LOG.info("{}: existing_unknown_host {}", host, datum);
} else {
- context.getCounter(NutchMetrics.GROUP_HOSTDB,
- NutchMetrics.HOSTDB_PURGED_UNKNOWN_HOST_TOTAL).increment(1);
+ purgedUnknownHostCounter.increment(1);
LOG.info("{}: purged_unknown_host {}", host, datum);
}
}
@@ -126,10 +159,8 @@ public class ResolverThread implements Runnable {
// Dynamic counter based on failure count - can't cache
context.getCounter(NutchMetrics.GROUP_HOSTDB,
createFailureCounterLabel(datum)).increment(1);
// Common error counters for consistency
- context.getCounter(NutchMetrics.GROUP_HOSTDB,
- NutchMetrics.ERROR_TOTAL).increment(1);
- context.getCounter(NutchMetrics.GROUP_HOSTDB,
- NutchMetrics.ERROR_NETWORK_TOTAL).increment(1);
+ errorsCounter.increment(1);
+ errorsNetworkCounter.increment(1);
} catch (Exception ioe) {
LOG.warn(StringUtils.stringifyException(ioe));
context.getCounter(NutchMetrics.GROUP_HOSTDB,
@@ -139,14 +170,12 @@ public class ResolverThread implements Runnable {
}
} catch (Exception e) {
LOG.warn(StringUtils.stringifyException(e));
- context.getCounter(NutchMetrics.GROUP_HOSTDB,
- NutchMetrics.ERROR_TOTAL).increment(1);
+ errorsCounter.increment(1);
context.getCounter(NutchMetrics.GROUP_HOSTDB,
ErrorTracker.getCounterName(e)).increment(1);
}
- context.getCounter(NutchMetrics.GROUP_HOSTDB,
- NutchMetrics.HOSTDB_CHECKED_HOSTS_TOTAL).increment(1);
+ checkedHostsCounter.increment(1);
}
private String createFailureCounterLabel(HostDatum datum) {
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
index 10a08d55a..b1736348b 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
@@ -80,12 +80,19 @@ public class UpdateHostDbMapper
normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
// Initialize cached counter references
- filteredRecordsCounter = context.getCounter(
- NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL);
+ initCounters(context);
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_HOSTDB, context);
}
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot
paths.
+ */
+ private void initCounters(Context context) {
+ filteredRecordsCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL);
+ }
+
/**
* Filters and or normalizes the input hostname by applying the configured
URL
* filters and normalizers the URL "http://hostname/".
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
index 6c979f222..878216b3c 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
@@ -154,6 +154,13 @@ public class UpdateHostDbReducer
}
// Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot
paths.
+ */
+ private void initCounters(Reducer<Text, NutchWritable, Text,
HostDatum>.Context context) {
urlLimitNotReachedCounter = context.getCounter(
NutchMetrics.GROUP_HOSTDB,
NutchMetrics.HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL);
totalHostsCounter = context.getCounter(
diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java
b/src/java/org/apache/nutch/indexer/CleaningJob.java
index ae01e4b0d..dc466dad0 100644
--- a/src/java/org/apache/nutch/indexer/CleaningJob.java
+++ b/src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -89,6 +90,9 @@ public class CleaningJob implements Tool {
IndexWriters writers = null;
+ // Cached counter reference for performance
+ private Counter deletedDocumentsCounter;
+
@Override
public void setup(Reducer<ByteWritable, Text, Text, ByteWritable>.Context context) {
Configuration conf = context.getConfiguration();
@@ -99,6 +103,17 @@ public class CleaningJob implements Tool {
throw new RuntimeException(e);
}
noCommit = conf.getBoolean("noCommit", false);
+
+ // Initialize cached counter reference
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot paths.
+ */
+ private void initCounters(Context context) {
+ deletedDocumentsCounter = context.getCounter(
+ NutchMetrics.GROUP_CLEANING, NutchMetrics.CLEANING_DELETED_DOCUMENTS_TOTAL);
}
@Override
@@ -119,8 +134,7 @@ public class CleaningJob implements Tool {
for (Text document : values) {
writers.delete(document.toString());
totalDeleted++;
- context.getCounter(NutchMetrics.GROUP_CLEANING,
- NutchMetrics.CLEANING_DELETED_DOCUMENTS_TOTAL).increment(1);
+ deletedDocumentsCounter.increment(1);
}
}
}
diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
index 0b728a588..fee0921d0 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
@@ -48,6 +48,7 @@ import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.mapreduce.Mapper;
@@ -328,6 +329,10 @@ public class WebGraph extends Configured implements Tool {
// url normalizers, filters and job configuration
private Configuration conf;
+ // Cached counter references for performance
+ private Counter addedLinksCounter;
+ private Counter removedLinksCounter;
+
/**
* Configures the OutlinkDb job reducer. Sets up internal links and link limiting.
*/
@@ -340,6 +345,18 @@ public class WebGraph extends Configured implements Tool {
limitPages = conf.getBoolean("link.ignore.limit.page", true);
limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot paths.
+ */
+ private void initCounters(Context context) {
+ addedLinksCounter = context.getCounter(
+ NutchMetrics.GROUP_WEBGRAPH, NutchMetrics.WEBGRAPH_ADDED_LINKS_TOTAL);
+ removedLinksCounter = context.getCounter(
+ NutchMetrics.GROUP_WEBGRAPH, NutchMetrics.WEBGRAPH_REMOVED_LINKS_TOTAL);
}
@Override
@@ -362,16 +379,14 @@ public class WebGraph extends Configured implements Tool {
mostRecent = timestamp;
}
outlinkList.add(WritableUtils.clone(next, conf));
- context.getCounter(NutchMetrics.GROUP_WEBGRAPH,
- NutchMetrics.WEBGRAPH_ADDED_LINKS_TOTAL).increment(1);
+ addedLinksCounter.increment(1);
} else if (value instanceof BooleanWritable) {
BooleanWritable delete = (BooleanWritable) value;
// Actually, delete is always true, otherwise we don't emit it in the
// mapper in the first place
if (delete.get() == true) {
// This page is gone, do not emit it's outlinks
- context.getCounter(NutchMetrics.GROUP_WEBGRAPH,
- NutchMetrics.WEBGRAPH_REMOVED_LINKS_TOTAL).increment(1);
+ removedLinksCounter.increment(1);
return;
}
}
diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
index f271adfe9..14b59ac85 100644
--- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
+++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
@@ -124,6 +124,15 @@ public class WARCExporter extends Configured implements Tool {
@Override
public void setup(Context context) {
// Initialize cached counter references
+ initCounters(context);
+ // Initialize error tracker with cached counters
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_WARC_EXPORTER, context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot paths.
+ */
+ private void initCounters(Context context) {
missingContentCounter = context.getCounter(
NutchMetrics.GROUP_WARC_EXPORTER,
NutchMetrics.WARC_MISSING_CONTENT_TOTAL);
missingMetadataCounter = context.getCounter(
@@ -132,8 +141,6 @@ public class WARCExporter extends Configured implements Tool {
NutchMetrics.GROUP_WARC_EXPORTER,
NutchMetrics.WARC_OMITTED_EMPTY_RESPONSE_TOTAL);
recordsGeneratedCounter = context.getCounter(
NutchMetrics.GROUP_WARC_EXPORTER,
NutchMetrics.WARC_RECORDS_GENERATED_TOTAL);
- // Initialize error tracker with cached counters
- errorTracker = new ErrorTracker(NutchMetrics.GROUP_WARC_EXPORTER, context);
}
@Override
diff --git a/src/java/org/apache/nutch/util/DomainStatistics.java b/src/java/org/apache/nutch/util/DomainStatistics.java
index 5ee09c846..4057795d5 100644
--- a/src/java/org/apache/nutch/util/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/DomainStatistics.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -38,6 +39,7 @@ import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metrics.NutchMetrics;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -52,10 +54,6 @@ public class DomainStatistics extends Configured implements Tool {
private static final Text FETCHED_TEXT = new Text("FETCHED");
private static final Text NOT_FETCHED_TEXT = new Text("NOT_FETCHED");
- public static enum MyCounter {
- FETCHED, NOT_FETCHED, EMPTY_RESULT
- };
-
private static final int MODE_HOST = 1;
private static final int MODE_DOMAIN = 2;
private static final int MODE_SUFFIX = 3;
@@ -158,10 +156,29 @@ public class DomainStatistics extends Configured implements Tool {
Mapper<Text, CrawlDatum, Text, LongWritable> {
int mode = 0;
+ // Cached counter references for performance
+ private Counter fetchedCounter;
+ private Counter notFetchedCounter;
+ private Counter emptyResultCounter;
+
@Override
public void setup(Context context) {
mode = context.getConfiguration().getInt("domain.statistics.mode",
MODE_DOMAIN);
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot paths.
+ */
+ private void initCounters(Context context) {
+ fetchedCounter = context.getCounter(
+ NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_FETCHED_TOTAL);
+ notFetchedCounter = context.getCounter(
+ NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_NOT_FETCHED_TOTAL);
+ emptyResultCounter = context.getCounter(
+ NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_EMPTY_RESULT_TOTAL);
}
@Override
@@ -197,17 +214,17 @@ public class DomainStatistics extends Configured implements Tool {
}
if (out.trim().equals("")) {
LOG.info("url : {}", url);
- context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
+ emptyResultCounter.increment(1);
}
context.write(new Text(out), new LongWritable(1));
} catch (Exception ex) {
}
- context.getCounter(MyCounter.FETCHED).increment(1);
+ fetchedCounter.increment(1);
context.write(FETCHED_TEXT, new LongWritable(1));
} else {
- context.getCounter(MyCounter.NOT_FETCHED).increment(1);
+ notFetchedCounter.increment(1);
context.write(NOT_FETCHED_TEXT, new LongWritable(1));
}
}
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index 4b55a72eb..21362223c 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -151,6 +151,15 @@ public class SitemapProcessor extends Configured implements Tool {
}
// Initialize cached counter references
+ initCounters(context);
+ // Initialize error tracker with cached counters
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_SITEMAP, context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot paths.
+ */
+ private void initCounters(Context context) {
filteredRecordsCounter = context.getCounter(
NutchMetrics.GROUP_SITEMAP,
NutchMetrics.SITEMAP_FILTERED_RECORDS_TOTAL);
seedsCounter = context.getCounter(
@@ -161,8 +170,6 @@ public class SitemapProcessor extends Configured implements Tool {
NutchMetrics.GROUP_SITEMAP,
NutchMetrics.SITEMAP_FILTERED_FROM_HOSTNAME_TOTAL);
failedFetchesCounter = context.getCounter(
NutchMetrics.GROUP_SITEMAP,
NutchMetrics.SITEMAP_FAILED_FETCHES_TOTAL);
- // Initialize error tracker with cached counters
- errorTracker = new ErrorTracker(NutchMetrics.GROUP_SITEMAP, context);
}
@Override
@@ -377,6 +384,13 @@ public class SitemapProcessor extends Configured implements Tool {
this.overwriteExisting = conf.getBoolean(SITEMAP_OVERWRITE_EXISTING, false);
// Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot paths.
+ */
+ private void initCounters(Context context) {
existingEntriesCounter = context.getCounter(
NutchMetrics.GROUP_SITEMAP,
NutchMetrics.SITEMAP_EXISTING_ENTRIES_TOTAL);
newEntriesCounter = context.getCounter(