This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 5f1330a03 NUTCH-3043 Generator: count URLs rejected by URL filters (#814)
5f1330a03 is described below
commit 5f1330a03d136440a167a85da6cfe8ac4b3f61b9
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue May 14 17:38:25 2024 +0200
NUTCH-3043 Generator: count URLs rejected by URL filters (#814)
- add counters URL_FILTERS_REJECTED and URL_FILTER_EXCEPTION
- simplify logging statement
- remove unnecessary cast
- use parameterized logging
---
src/java/org/apache/nutch/crawl/Generator.java | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 33f743a37..f57642a65 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -224,9 +224,12 @@ public class Generator extends NutchTool implements Tool {
// If filtering is on don't generate URLs that don't pass
// URLFilters
try {
- if (filters.filter(url.toString()) == null)
+ if (filters.filter(url.toString()) == null) {
+ context.getCounter("Generator", "URL_FILTERS_REJECTED").increment(1);
return;
+ }
} catch (URLFilterException e) {
+ context.getCounter("Generator", "URL_FILTER_EXCEPTION").increment(1);
LOG.warn("Couldn't filter url: {} ({})", url, e.getMessage());
}
}
@@ -253,10 +256,7 @@ public class Generator extends NutchTool implements Tool {
try {
sort = scfilters.generatorSortValue(key, crawlDatum, sort);
} catch (ScoringFilterException sfe) {
- if (LOG.isWarnEnabled()) {
- LOG.warn(
- "Couldn't filter generatorSortValue for " + key + ": " + sfe);
- }
+ LOG.warn("Couldn't filter generatorSortValue for {}: {}", key, sfe);
}
// check expr
@@ -625,7 +625,7 @@ public class Generator extends NutchTool implements Tool {
// make later bytes more significant in hash code, so that sorting
// by hashcode correlates less with by-host ordering.
for (int i = length - 1; i >= 0; i--)
- hash = (31 * hash) + (int) bytes[start + i];
+ hash = (31 * hash) + bytes[start + i];
return hash;
}
}