This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 36c2ce6 NUTCH-2598 URLNormalizerChecker fails on invalid URLs in
input - output empty string for invalid URLs (MalformedURLException thrown) or
if normalizer(s) return null
new 9dd11cd Merge pull request #435 from sebastian-nagel/NUTCH-2598-normalizerchecker-fails-on-invalid-url
36c2ce6 is described below
commit 36c2ce6925a7fc4e4c3f5b6052ee96d491d2e5eb
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue Jan 22 17:04:16 2019 +0100
NUTCH-2598 URLNormalizerChecker fails on invalid URLs in input
- output empty string for invalid URLs (MalformedURLException thrown)
or if normalizer(s) return null
---
src/java/org/apache/nutch/net/URLNormalizerChecker.java | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/src/java/org/apache/nutch/net/URLNormalizerChecker.java b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
index 2805f85..ee25f2f 100644
--- a/src/java/org/apache/nutch/net/URLNormalizerChecker.java
+++ b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
@@ -16,6 +16,8 @@
*/
package org.apache.nutch.net;
+import java.net.MalformedURLException;
+
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.AbstractChecker;
@@ -35,7 +37,8 @@ public class URLNormalizerChecker extends AbstractChecker {
+ "\n \t(if not given all configured URL normalizers are applied)"
+ "\n -scope \tone of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink"
+ "\n -stdin \ttool reads a list of URLs from stdin, one URL per line"
- + "\n -listen <port>\trun tool as Telnet server listening on <port>\n";
+ + "\n -listen <port>\trun tool as Telnet server listening on <port>"
+ + "\n\nAn empty line is added to the output if a URL fails to normalize (MalformedURLException or null returned).\n";
// Print help when no args given
if (args.length < 1) {
@@ -71,7 +74,16 @@ public class URLNormalizerChecker extends AbstractChecker {
}
protected int process(String line, StringBuilder output) throws Exception {
- output.append(normalizers.normalize(line, scope));
+ try {
+ String norm = normalizers.normalize(line, scope);
+ if (norm == null) {
+ output.append("");
+ } else {
+ output.append(norm);
+ }
+ } catch (MalformedURLException e) {
+ output.append("");
+ }
return 0;
}