Repository: nutch Updated Branches: refs/heads/master 836b2e01d -> d4c924e56
revert 2320 Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/d4c924e5 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/d4c924e5 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/d4c924e5 Branch: refs/heads/master Commit: d4c924e56030d6b1fa3b115686e80c8cf516db61 Parents: 836b2e0 Author: Markus Jelsma <mar...@apache.org> Authored: Thu Oct 6 10:56:50 2016 +0200 Committer: Markus Jelsma <mar...@apache.org> Committed: Thu Oct 6 10:56:50 2016 +0200 ---------------------------------------------------------------------- .../org/apache/nutch/net/URLFilterChecker.java | 181 ++++++------------- 1 file changed, 59 insertions(+), 122 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/d4c924e5/src/java/org/apache/nutch/net/URLFilterChecker.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java index 86b91e2..89a3d00 100644 --- a/src/java/org/apache/nutch/net/URLFilterChecker.java +++ b/src/java/org/apache/nutch/net/URLFilterChecker.java @@ -17,27 +17,16 @@ package org.apache.nutch.net; -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.io.PrintWriter; -import java.net.ServerSocket; -import java.net.Socket; -import java.net.InetSocketAddress; -import java.nio.charset.Charset; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - import org.apache.nutch.plugin.Extension; import org.apache.nutch.plugin.ExtensionPoint; import org.apache.nutch.plugin.PluginRepository; -import org.apache.nutch.util.NutchConfiguration; import org.apache.hadoop.conf.Configuration; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.nutch.util.NutchConfiguration; + +import java.io.BufferedReader; +import java.io.InputStreamReader; /** * Checks one given filter or all filters. @@ -47,118 +36,62 @@ import org.slf4j.LoggerFactory; public class URLFilterChecker { private Configuration conf; - private static String filterName = null; - protected static boolean keepClientCnxOpen = false; - protected static int tcpPort = -1; - protected URLFilters filters = null; - - public static final Logger LOG = LoggerFactory - .getLogger(URLFilterChecker.class); - + public URLFilterChecker(Configuration conf) { - System.out.println("Checking combination of all URLFilters available"); this.conf = conf; - if (filterName != null) { - this.conf.set("plugin.includes", filterName); - } - filters = new URLFilters(this.conf); } - - public void run() throws Exception { - // In listening mode? - if (tcpPort == -1) { - // No, just fetch and display - checkStdin(); - } else { - // Listen on socket and start workers on incoming requests - listen(); - } - } - - private void listen() throws Exception { - ServerSocket server = null; - - try{ - server = new ServerSocket(); - server.bind(new InetSocketAddress(tcpPort)); - LOG.info(server.toString()); - } catch (Exception e) { - LOG.error("Could not listen on port " + tcpPort); - System.exit(-1); - } - - while(true){ - Worker worker; - try{ - worker = new Worker(server.accept()); - Thread thread = new Thread(worker); - thread.start(); - } catch (Exception e) { - LOG.error("Accept failed: " + tcpPort); - System.exit(-1); - } - } - } - - private class Worker implements Runnable { - private Socket client; - Worker(Socket client) { - this.client = client; - LOG.info(client.toString()); - } + private void checkOne(String filterName) throws Exception { + URLFilter filter = null; + + ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( + URLFilter.X_POINT_ID); + + if (point == null) + throw new RuntimeException(URLFilter.X_POINT_ID + " not found."); - public void run() { - if (keepClientCnxOpen) { - while (true) { // keep connection open until closes - readWrite(); - } + Extension[] extensions = point.getExtensions(); + + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + filter = (URLFilter) extension.getExtensionInstance(); + if (filter.getClass().getName().equals(filterName)) { + break; } else { - readWrite(); - - try { // close ourselves - client.close(); - } catch (Exception e){ - LOG.error(e.toString()); - } + filter = null; } } - - protected void readWrite() { - String line; - BufferedReader in = null; - PrintWriter out = null; - - try{ - in = new BufferedReader(new InputStreamReader(client.getInputStream())); - } catch (Exception e) { - LOG.error("in or out failed"); - System.exit(-1); - } - try{ - line = in.readLine(); - - String result = filters.filter(line); - String output; - if (result != null) { - output = "+" + result + "\n"; - } else { - output = "-" + line + "\n";; - } - - client.getOutputStream().write(output.getBytes(Charset.forName("UTF-8"))); - }catch (Exception e) { - LOG.error("Read/Write failed: " + e); + if (filter == null) + throw new RuntimeException("Filter " + filterName + " not found."); + + // jerome : should we keep this behavior? + // if (LogFormatter.hasLoggedSevere()) + // throw new RuntimeException("Severe error encountered."); + + System.out.println("Checking URLFilter " + filterName); + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + String line; + while ((line = in.readLine()) != null) { + String out = filter.filter(line); + if (out != null) { + System.out.print("+"); + System.out.println(out); + } else { + System.out.print("-"); + System.out.println(line); } } } - private void checkStdin() throws Exception { + private void checkAll() throws Exception { + System.out.println("Checking combination of all URLFilters available"); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; - while ((line = in.readLine()) != null) { + URLFilters filters = new URLFilters(this.conf); String out = filters.filter(line); if (out != null) { System.out.print("+"); @@ -172,26 +105,30 @@ public class URLFilterChecker { public static void main(String[] args) throws Exception { - String usage = "Usage: URLFilterChecker (-filterName filterName | -allCombined) [-listen <port>] [-keepClientCnxOpen]) \n" + String usage = "Usage: URLFilterChecker (-filterName filterName | -allCombined) \n" + "Tool takes a list of URLs, one per line, passed via STDIN.\n"; - if (args.length < 1) { + if (args.length == 0) { System.err.println(usage); System.exit(-1); } - for (int i = 0; i < args.length; i++) { - if (args[i].equals("-filterName")) { - filterName = args[++i]; - } else if (args[i].equals("-listen")) { - tcpPort = Integer.parseInt(args[++i]); - } else if (args[i].equals("-keepClientCnxOpen")) { - keepClientCnxOpen = true; + String filterName = null; + if (args[0].equals("-filterName")) { + if (args.length != 2) { + System.err.println(usage); + System.exit(-1); } + filterName = args[1]; } - + URLFilterChecker checker = new URLFilterChecker(NutchConfiguration.create()); - checker.run(); + if (filterName != null) { + checker.checkOne(filterName); + } else { + checker.checkAll(); + } + System.exit(0); } }