This is an automated email from the ASF dual-hosted git repository. lewismc pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 02dca3b6d NUTCH-2936 Early registration of URL stream handlers provided by plugins may fail Hadoop jobs running in distributed mode (#726) 02dca3b6d is described below commit 02dca3b6d097af0f8fa76ce17f0a33267964bf19 Author: Lewis John McGibbney <lewis.mcgibb...@gmail.com> AuthorDate: Fri May 20 11:04:22 2022 -0700 NUTCH-2936 Early registration of URL stream handlers provided by plugins may fail Hadoop jobs running in distributed mode (#726) * NUTCH-2936 Early registration of URL stream handlers provided by plugins may fail Hadoop jobs running in distributed mode --- src/java/org/apache/nutch/parse/ParserChecker.java | 48 +++---- src/java/org/apache/nutch/plugin/Extension.java | 12 +- .../org/apache/nutch/plugin/ExtensionPoint.java | 18 +-- .../apache/nutch/plugin/PluginManifestParser.java | 3 +- .../org/apache/nutch/plugin/PluginRepository.java | 57 +++++---- .../nutch/plugin/URLStreamHandlerFactory.java | 13 +- .../apache/nutch/protocol/http/api/HttpBase.java | 140 +++++++++++---------- src/plugin/protocol-foo/plugin.xml | 2 +- src/plugin/protocol-okhttp/ivy.xml | 4 +- src/plugin/protocol-okhttp/plugin.xml | 12 +- .../org/apache/nutch/protocol/okhttp/OkHttp.java | 48 +++---- .../nutch/protocol/okhttp/OkHttpResponse.java | 26 ++-- 12 files changed, 195 insertions(+), 188 deletions(-) diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java index 6c82a516b..5da023fdc 100644 --- a/src/java/org/apache/nutch/parse/ParserChecker.java +++ b/src/java/org/apache/nutch/parse/ParserChecker.java @@ -114,15 +114,15 @@ public class ParserChecker extends AbstractChecker { int numConsumed; for (int i = 0; i < args.length; i++) { if (args[i].equals("-normalize")) { - normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT); + this.normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT); } else if (args[i].equals("-followRedirects")) { - followRedirects = true; + this.followRedirects = true; } else if (args[i].equals("-checkRobotsTxt")) { - checkRobotsTxt = true; + this.checkRobotsTxt = true; } else if (args[i].equals("-forceAs")) { - forceAsContentType = args[++i]; + this.forceAsContentType = args[++i]; } else if (args[i].equals("-dumpText")) { - dumpText = true; + this.dumpText = true; } else if (args[i].equals("-md")) { String k = null, v = null; String nextOne = args[++i]; @@ -132,7 +132,7 @@ public class ParserChecker extends AbstractChecker { v = nextOne.substring(firstEquals + 1); } else k = nextOne; - metadata.put(k, v); + this.metadata.put(k, v); } else if ((numConsumed = super.parseArgs(args, i)) > 0) { i += numConsumed - 1; } else if (i != args.length - 1) { @@ -144,7 +144,7 @@ public class ParserChecker extends AbstractChecker { } } - scfilters = new ScoringFilters(getConf()); + this.scfilters = new ScoringFilters(getConf()); if (url != null) { return super.processSingle(url); @@ -155,25 +155,25 @@ public class ParserChecker extends AbstractChecker { } protected int process(String url, StringBuilder output) throws Exception { - if (normalizers != null) { - url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT); + if (this.normalizers != null) { + url = this.normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT); } LOG.info("fetching: " + url); CrawlDatum datum = new CrawlDatum(); - Iterator<String> iter = metadata.keySet().iterator(); + Iterator<String> iter = this.metadata.keySet().iterator(); while (iter.hasNext()) { String key = iter.next(); - String value = metadata.get(key); + String value = this.metadata.get(key); if (value == null) value = ""; datum.getMetaData().put(new Text(key), new Text(value)); } int maxRedirects = getConf().getInt("http.redirect.max", 3); - if (followRedirects) { + if (this.followRedirects) { if (maxRedirects == 0) { LOG.info("Following max. 3 redirects (ignored http.redirect.max == 0)"); maxRedirects = 3; @@ -183,30 +183,30 @@ public class ParserChecker extends AbstractChecker { } ProtocolOutput protocolOutput = getProtocolOutput(url, datum, - checkRobotsTxt); + this.checkRobotsTxt); Text turl = new Text(url); // Following redirects and not reached maxRedirects? int numRedirects = 0; while (protocolOutput != null && !protocolOutput.getStatus().isSuccess() - && followRedirects && protocolOutput.getStatus().isRedirect() + && this.followRedirects && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) { String[] stuff = protocolOutput.getStatus().getArgs(); url = stuff[0]; LOG.info("Follow redirect to {}", url); - if (normalizers != null) { - url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT); + if (this.normalizers != null) { + url = this.normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT); } turl.set(url); // try again - protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt); + protocolOutput = getProtocolOutput(url, datum, this.checkRobotsTxt); numRedirects++; } - if (checkRobotsTxt && protocolOutput == null) { + if (this.checkRobotsTxt && protocolOutput == null) { System.err.println("Fetch disallowed by robots.txt"); return -1; } @@ -231,9 +231,9 @@ public class ParserChecker extends AbstractChecker { } String contentType; - if (forceAsContentType != null) { - content.setContentType(forceAsContentType); - contentType = forceAsContentType; + if (this.forceAsContentType != null) { + content.setContentType(this.forceAsContentType); + contentType = this.forceAsContentType; } else { contentType = content.getContentType(); } @@ -253,7 +253,7 @@ public class ParserChecker extends AbstractChecker { // call the scoring filters try { - scfilters.passScoreBeforeParsing(turl, datum, content); + this.scfilters.passScoreBeforeParsing(turl, datum, content); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e @@ -284,7 +284,7 @@ public class ParserChecker extends AbstractChecker { Parse parse = entry.getValue(); // call the scoring filters try { - scfilters.passScoreAfterParsing(turl, content, parse); + this.scfilters.passScoreAfterParsing(turl, content, parse); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e @@ -295,7 +295,7 @@ public class ParserChecker extends AbstractChecker { output.append(turl).append("\n"); output.append(parse.getData()).append("\n"); - if (dumpText) { + if (this.dumpText) { output.append(parse.getText()); } } diff --git a/src/java/org/apache/nutch/plugin/Extension.java b/src/java/org/apache/nutch/plugin/Extension.java index ab65bb82f..246e8ff7b 100644 --- a/src/java/org/apache/nutch/plugin/Extension.java +++ b/src/java/org/apache/nutch/plugin/Extension.java @@ -140,13 +140,15 @@ public class Extension { } /** - * Return an instance of the extension implementatio. Before we create a + * Return an instance of the extension implementation. Before we create a * extension instance we startup the plugin if it is not already done. The * plugin instance and the extension instance use the same - * <code>PluginClassLoader</code>. Each Plugin use its own classloader. The - * PluginClassLoader knows only own <i>Plugin runtime libraries </i> setuped - * in the plugin manifest file and exported libraries of the depenedend - * plugins. + * {@link org.apache.nutch.plugin.PluginClassLoader}. + * Each Plugin use its own classloader. The + * {@link org.apache.nutch.plugin.PluginClassLoader} knows only its own + * <i>plugin runtime libraries</i> defined + * in the <code>plugin.xml</code> manifest file and exported libraries + * of the dependent plugins. * * @return Object An instance of the extension implementation * @throws PluginRuntimeException if there is a fatal runtime error diff --git a/src/java/org/apache/nutch/plugin/ExtensionPoint.java b/src/java/org/apache/nutch/plugin/ExtensionPoint.java index 58422046b..92ddfd815 100644 --- a/src/java/org/apache/nutch/plugin/ExtensionPoint.java +++ b/src/java/org/apache/nutch/plugin/ExtensionPoint.java @@ -44,7 +44,7 @@ public class ExtensionPoint { setId(pId); setName(pName); setSchema(pSchema); - fExtensions = new ArrayList<>(); + this.fExtensions = new ArrayList<>(); } /** @@ -53,7 +53,7 @@ public class ExtensionPoint { * @return String */ public String getId() { - return ftId; + return this.ftId; } /** @@ -62,7 +62,7 @@ public class ExtensionPoint { * @return String */ public String getName() { - return fName; + return this.fName; } /** @@ -71,7 +71,7 @@ public class ExtensionPoint { * @return String */ public String getSchema() { - return fSchema; + return this.fSchema; } /** @@ -81,7 +81,7 @@ public class ExtensionPoint { * extension point id */ private void setId(String pId) { - ftId = pId; + this.ftId = pId; } /** @@ -90,7 +90,7 @@ public class ExtensionPoint { * @param pName */ private void setName(String pName) { - fName = pName; + this.fName = pName; } /** @@ -99,7 +99,7 @@ public class ExtensionPoint { * @param pSchema */ private void setSchema(String pSchema) { - fSchema = pSchema; + this.fSchema = pSchema; } /** @@ -109,7 +109,7 @@ public class ExtensionPoint { * to install */ public void addExtension(Extension extension) { - fExtensions.add(extension); + this.fExtensions.add(extension); } /** @@ -118,7 +118,7 @@ public class ExtensionPoint { * @return Extension[] */ public Extension[] getExtensions() { - return fExtensions.toArray(new Extension[fExtensions.size()]); + return this.fExtensions.toArray(new Extension[this.fExtensions.size()]); } } diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java b/src/java/org/apache/nutch/plugin/PluginManifestParser.java index 4c845b430..ccae5827d 100644 --- a/src/java/org/apache/nutch/plugin/PluginManifestParser.java +++ b/src/java/org/apache/nutch/plugin/PluginManifestParser.java @@ -19,6 +19,7 @@ package org.apache.nutch.plugin; import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; +import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; @@ -50,7 +51,7 @@ public class PluginManifestParser { private static final String ATTR_CLASS = "class"; private static final String ATTR_ID = "id"; - protected static final Logger LOG = LoggerFactory.getLogger(PluginManifestParser.class); + protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private static final boolean WINDOWS = System.getProperty("os.name").startsWith("Windows"); diff --git a/src/java/org/apache/nutch/plugin/PluginRepository.java b/src/java/org/apache/nutch/plugin/PluginRepository.java index 726da4566..3c554094b 100644 --- a/src/java/org/apache/nutch/plugin/PluginRepository.java +++ b/src/java/org/apache/nutch/plugin/PluginRepository.java @@ -64,6 +64,7 @@ public class PluginRepository implements URLStreamHandlerFactory { private HashMap<String, Plugin> fActivatedPlugins; + @SuppressWarnings("rawtypes") private static final Map<String, Map<PluginClassLoader, Class>> CLASS_CACHE = new HashMap<>(); private Configuration conf; @@ -75,8 +76,8 @@ public class PluginRepository implements URLStreamHandlerFactory { * @throws RuntimeException if a fatal runtime error is encountered */ public PluginRepository(Configuration conf) throws RuntimeException { - fActivatedPlugins = new HashMap<>(); - fExtensionPoints = new HashMap<>(); + this.fActivatedPlugins = new HashMap<>(); + this.fExtensionPoints = new HashMap<>(); this.conf = new Configuration(conf); this.auto = conf.getBoolean("plugin.auto-activation", true); String[] pluginFolders = conf.getStrings("plugin.folders"); @@ -92,11 +93,11 @@ public class PluginRepository implements URLStreamHandlerFactory { Pattern includes = Pattern.compile(conf.get("plugin.includes", "")); Map<String, PluginDescriptor> filteredPlugins = filter(excludes, includes, allPlugins); - fRegisteredPlugins = getDependencyCheckedPlugins(filteredPlugins, + this.fRegisteredPlugins = getDependencyCheckedPlugins(filteredPlugins, this.auto ? allPlugins : filteredPlugins); - installExtensionPoints(fRegisteredPlugins); + installExtensionPoints(this.fRegisteredPlugins); try { - installExtensions(fRegisteredPlugins); + installExtensions(this.fRegisteredPlugins); } catch (PluginRuntimeException e) { LOG.error("Could not install extensions.", e.toString()); throw new RuntimeException(e.getMessage()); @@ -134,7 +135,7 @@ public class PluginRepository implements URLStreamHandlerFactory { for (ExtensionPoint point : plugin.getExtenstionPoints()) { String xpId = point.getId(); LOG.debug("Adding extension point {}", xpId); - fExtensionPoints.put(xpId, point); + this.fExtensionPoints.put(xpId, point); } } } @@ -236,8 +237,8 @@ public class PluginRepository implements URLStreamHandlerFactory { * @return PluginDescriptor[] */ public PluginDescriptor[] getPluginDescriptors() { - return fRegisteredPlugins - .toArray(new PluginDescriptor[fRegisteredPlugins.size()]); + return this.fRegisteredPlugins + .toArray(new PluginDescriptor[this.fRegisteredPlugins.size()]); } /** @@ -248,7 +249,7 @@ public class PluginRepository implements URLStreamHandlerFactory { */ public PluginDescriptor getPluginDescriptor(String pPluginId) { - for (PluginDescriptor descriptor : fRegisteredPlugins) { + for (PluginDescriptor descriptor : this.fRegisteredPlugins) { if (descriptor.getPluginId().equals(pPluginId)) return descriptor; } @@ -282,8 +283,8 @@ public class PluginRepository implements URLStreamHandlerFactory { */ public Plugin getPluginInstance(PluginDescriptor pDescriptor) throws PluginRuntimeException { - if (fActivatedPlugins.containsKey(pDescriptor.getPluginId())) - return fActivatedPlugins.get(pDescriptor.getPluginId()); + if (this.fActivatedPlugins.containsKey(pDescriptor.getPluginId())) + return this.fActivatedPlugins.get(pDescriptor.getPluginId()); try { // Must synchronize here to make sure creation and initialization // of a plugin instance are done by one and only one thread. @@ -297,7 +298,7 @@ public class PluginRepository implements URLStreamHandlerFactory { Plugin plugin = (Plugin) constructor .newInstance(new Object[] { pDescriptor, this.conf }); plugin.startUp(); - fActivatedPlugins.put(pDescriptor.getPluginId(), plugin); + this.fActivatedPlugins.put(pDescriptor.getPluginId(), plugin); return plugin; } } catch (ClassNotFoundException e) { @@ -318,6 +319,7 @@ public class PluginRepository implements URLStreamHandlerFactory { * @deprecated * @see <a href="https://openjdk.java.net/jeps/421">JEP 421: Deprecate Finalization for Removal</a> * @see java.lang.Object#finalize() + * @deprecated */ @Deprecated public void finalize() throws Throwable { @@ -330,12 +332,13 @@ public class PluginRepository implements URLStreamHandlerFactory { * @throws PluginRuntimeException */ private void shutDownActivatedPlugins() throws PluginRuntimeException { - for (Plugin plugin : fActivatedPlugins.values()) { + for (Plugin plugin : this.fActivatedPlugins.values()) { plugin.shutDown(); } } - public Class getCachedClass(PluginDescriptor pDescriptor, String className) + @SuppressWarnings("rawtypes") + public static Class getCachedClass(PluginDescriptor pDescriptor, String className) throws ClassNotFoundException { Map<PluginClassLoader, Class> descMap = CLASS_CACHE.get(className); if (descMap == null) { @@ -355,19 +358,19 @@ public class PluginRepository implements URLStreamHandlerFactory { LOG.info("Plugin Auto-activation mode: [{}]", this.auto); LOG.info("Registered Plugins:"); - if ((fRegisteredPlugins == null) || (fRegisteredPlugins.size() == 0)) { + if ((this.fRegisteredPlugins == null) || (this.fRegisteredPlugins.size() == 0)) { LOG.info("\tNONE"); } else { - for (PluginDescriptor plugin : fRegisteredPlugins) { + for (PluginDescriptor plugin : this.fRegisteredPlugins) { LOG.info("\t{} ({})", plugin.getName(), plugin.getPluginId()); } } LOG.info("Registered Extension-Points:"); - if ((fExtensionPoints == null) || (fExtensionPoints.size() == 0)) { + if ((this.fExtensionPoints == null) || (this.fExtensionPoints.size() == 0)) { LOG.info("\tNONE"); } else { - for (ExtensionPoint ep : fExtensionPoints.values()) { + for (ExtensionPoint ep : this.fExtensionPoints.values()) { LOG.info("\t ({})", ep.getName(), ep.getId()); } } @@ -384,7 +387,7 @@ public class PluginRepository implements URLStreamHandlerFactory { * Map of plugins * @return map of plugins matching the configuration */ - private Map<String, PluginDescriptor> filter(Pattern excludes, + private static Map<String, PluginDescriptor> filter(Pattern excludes, Pattern includes, Map<String, PluginDescriptor> plugins) { Map<String, PluginDescriptor> map = new HashMap<>(); @@ -434,11 +437,11 @@ public class PluginRepository implements URLStreamHandlerFactory { public synchronized Object[] getOrderedPlugins(Class<?> clazz, String xPointId, String orderProperty) { Object[] filters; - ObjectCache objectCache = ObjectCache.get(conf); + ObjectCache objectCache = ObjectCache.get(this.conf); filters = (Object[]) objectCache.getObject(clazz.getName()); if (filters == null) { - String order = conf.get(orderProperty); + String order = this.conf.get(orderProperty); List<String> orderOfFilters = new ArrayList<>(); boolean userDefinedOrder = false; if (order != null && !order.trim().isEmpty()) { @@ -447,7 +450,7 @@ public class PluginRepository implements URLStreamHandlerFactory { } try { - ExtensionPoint point = PluginRepository.get(conf) + ExtensionPoint point = PluginRepository.get(this.conf) .getExtensionPoint(xPointId); if (point == null) throw new RuntimeException(xPointId + " not found."); @@ -579,8 +582,8 @@ public class PluginRepository implements URLStreamHandlerFactory { public URLStreamHandler createURLStreamHandler(String protocol) { LOG.debug("Creating URLStreamHandler for protocol: {}", protocol); - if (fExtensionPoints != null) { - ExtensionPoint ep = fExtensionPoints + if (this.fExtensionPoints != null) { + ExtensionPoint ep = this.fExtensionPoints .get("org.apache.nutch.protocol.Protocol"); if (ep != null) { Extension[] extensions = ep.getExtensions(); @@ -602,8 +605,8 @@ public class PluginRepository implements URLStreamHandlerFactory { // return the handler here, if possible String handlerClass = extension.getAttribute("urlStreamHandler"); - LOG.debug("Located URLStreamHandler: {}", handlerClass); if (handlerClass != null) { + LOG.debug("Located URLStreamHandler: {}", handlerClass); // the nutch classloader ClassLoader cl = this.getClass().getClassLoader(); if (extinst != null) { @@ -622,11 +625,11 @@ public class PluginRepository implements URLStreamHandlerFactory { } } - LOG.debug("suitable protocol extension found that did not declare a handler"); + LOG.debug("Suitable protocol extension found that did not declare a handler"); return null; } } - LOG.debug("No suitable protocol extensions registered"); + LOG.debug("No suitable protocol extensions registered for protocol: {}", protocol); } else { LOG.debug("No protocol extensions registered?"); } diff --git a/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java index a64454c5f..5aed76a35 100644 --- a/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java +++ b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java @@ -59,7 +59,7 @@ public class URLStreamHandlerFactory } private URLStreamHandlerFactory() { - prs = new ArrayList<>(); + this.prs = new ArrayList<>(); } /** @@ -75,7 +75,7 @@ public class URLStreamHandlerFactory * @param pr The PluginRepository to be registered. */ public void registerPluginRepository(PluginRepository pr) { - prs.add(new WeakReference<PluginRepository>(pr)); + this.prs.add(new WeakReference<PluginRepository>(pr)); removeInvalidRefs(); } @@ -88,7 +88,7 @@ public class URLStreamHandlerFactory // find the 'correct' PluginRepository. For now we simply take the first. // then ask it to return the URLStreamHandler - for(WeakReference<PluginRepository> ref: prs) { + for(WeakReference<PluginRepository> ref: this.prs) { PluginRepository pr = ref.get(); if(pr != null) { // found PluginRepository. Let's get the URLStreamHandler... @@ -103,13 +103,12 @@ public class URLStreamHandlerFactory * garbage collected meanwhile. */ private void removeInvalidRefs() { - LOG.debug("removeInvalidRefs()"); - ArrayList<WeakReference<PluginRepository>> copy = new ArrayList<>(prs); + ArrayList<WeakReference<PluginRepository>> copy = new ArrayList<>(this.prs); for(WeakReference<PluginRepository> ref: copy) { if(ref.get() == null) { - prs.remove(ref); + this.prs.remove(ref); } } - LOG.debug("Removed the following invalid references: '{}' Remaining: '{}'", copy.size()-prs.size(), prs.size()); + LOG.debug("Removed '{}' invalid references. '{}' remaining.", copy.size()-this.prs.size(), this.prs.size()); } } diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index ce999b374..1438754ce 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -36,6 +36,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.HttpHeaders; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.protocols.ProtocolLogUtil; import org.apache.nutch.net.protocols.Response; @@ -200,7 +201,7 @@ public abstract class HttpBase implements Protocol { if (logger != null) { this.logger = logger; } - robots = new HttpRobotRulesParser(); + this.robots = new HttpRobotRulesParser(); } @Override @@ -211,7 +212,7 @@ public abstract class HttpBase implements Protocol { this.proxyType = Proxy.Type.valueOf(conf.get("http.proxy.type", "HTTP")); this.proxyException = arrayToMap( conf.getStrings("http.proxy.exception.list")); - this.useProxy = (proxyHost != null && proxyHost.length() > 0); + this.useProxy = (this.proxyHost != null && this.proxyHost.length() > 0); this.timeout = conf.getInt("http.timeout", 10000); this.maxContent = conf.getInt("http.content.limit", 1024 * 1024); this.maxDuration = conf.getInt("http.time.limit", -1); @@ -219,10 +220,10 @@ public abstract class HttpBase implements Protocol { this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf.get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email")); - this.acceptLanguage = conf.get("http.accept.language", acceptLanguage) + this.acceptLanguage = conf.get("http.accept.language", this.acceptLanguage) .trim(); - this.acceptCharset = conf.get("http.accept.charset", acceptCharset).trim(); - this.accept = conf.get("http.accept", accept).trim(); + this.acceptCharset = conf.get("http.accept.charset", this.acceptCharset).trim(); + this.accept = conf.get("http.accept", this.accept).trim(); this.mimeTypes = new MimeUtil(conf); // backward-compatible default setting this.useHttp11 = conf.getBoolean("http.useHttp11", true); @@ -244,27 +245,28 @@ public abstract class HttpBase implements Protocol { // NUTCH-1941: read list of alternating agent names if (conf.getBoolean("http.agent.rotate", false)) { String agentsFile = conf.get("http.agent.rotate.file", "agents.txt"); + @SuppressWarnings("resource") BufferedReader br = null; try { Reader reader = conf.getConfResourceAsReader(agentsFile); br = new BufferedReader(reader); - userAgentNames = new ArrayList<String>(); + this.userAgentNames = new ArrayList<String>(); String word = ""; while ((word = br.readLine()) != null) { if (!word.trim().isEmpty()) - userAgentNames.add(word.trim()); + this.userAgentNames.add(word.trim()); } - if (userAgentNames.size() == 0) { - logger.warn("Empty list of user agents in http.agent.rotate.file {}", + if (this.userAgentNames.size() == 0) { + this.logger.warn("Empty list of user agents in http.agent.rotate.file {}", agentsFile); - userAgentNames = null; + this.userAgentNames = null; } } catch (Exception e) { - logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile, + this.logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile, StringUtils.stringifyException(e)); - userAgentNames = null; + this.userAgentNames = null; } finally { if (br != null) { try { @@ -274,28 +276,29 @@ public abstract class HttpBase implements Protocol { } } } - if (userAgentNames == null) { - logger.warn( + if (this.userAgentNames == null) { + this.logger.warn( "Falling back to fixed user agent set via property http.agent.name"); } } // If cookies are enabled, try to load a per-host cookie file - if (enableCookieHeader) { + if (this.enableCookieHeader) { String cookieFile = conf.get("http.agent.host.cookie.file", "cookies.txt"); + @SuppressWarnings("resource") BufferedReader br = null; try { Reader reader = conf.getConfResourceAsReader(cookieFile); br = new BufferedReader(reader); - hostCookies = new HashMap<String, String>(); + this.hostCookies = new HashMap<String, String>(); String word = ""; while ((word = br.readLine()) != null) { if (!word.trim().isEmpty()) { if (word.indexOf("#") == -1) { // skip comment String[] parts = word.split("\t"); if (parts.length == 2) { - hostCookies.put(parts[0], parts[1]); + this.hostCookies.put(parts[0], parts[1]); } else { LOG.warn("Unable to parse cookie file correctly at: " + word); } @@ -303,9 +306,9 @@ public abstract class HttpBase implements Protocol { } } } catch (Exception e) { - logger.warn("Failed to read http.agent.host.cookie.file {}: {}", + this.logger.warn("Failed to read http.agent.host.cookie.file {}: {}", cookieFile, StringUtils.stringifyException(e)); - hostCookies = null; + this.hostCookies = null; } finally { if (br != null) { try { @@ -368,8 +371,8 @@ public abstract class HttpBase implements Protocol { "TLS_CHACHA20_POLY1305_SHA256", "TLS_AES_128_GCM_SHA256", "TLS_AES_128_CCM_8_SHA256", "TLS_AES_128_CCM_SHA256"); - tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols)); - tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers)); + this.tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols)); + this.tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers)); logConf(); } @@ -401,7 +404,7 @@ public abstract class HttpBase implements Protocol { byte[] content = response.getContent(); Content c = new Content(u.toString(), u.toString(), (content == null ? EMPTY_CONTENT : content), - response.getHeader("Content-Type"), response.getHeaders(), mimeTypes); + response.getHeader("Content-Type"), response.getHeaders(), this.mimeTypes); if (code == 200) { // got a good response return new ProtocolOutput(c); // return it @@ -437,15 +440,15 @@ public abstract class HttpBase implements Protocol { // handle this in the higher layer. return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u)); } else if (code == 400) { // bad request, mark as GONE - if (logger.isTraceEnabled()) { - logger.trace("400 Bad request: " + u); + if (this.logger.isTraceEnabled()) { + this.logger.trace("400 Bad request: " + u); } return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u)); } else if (code == 401) { // requires authorization, but no valid auth // provided. - if (logger.isTraceEnabled()) { - logger.trace("401 Authentication Required"); + if (this.logger.isTraceEnabled()) { + this.logger.trace("401 Authentication Required"); } return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, @@ -461,10 +464,10 @@ public abstract class HttpBase implements Protocol { ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u)); } } catch (Throwable e) { - if (logger.isDebugEnabled() || !logUtil.logShort(e)) { - logger.error("Failed to get protocol output", e); + if (this.logger.isDebugEnabled() || !this.logUtil.logShort(e)) { + this.logger.error("Failed to get protocol output", e); } else { - logger.error("Failed to get protocol output: {}", + this.logger.error("Failed to get protocol output: {}", e.getClass().getName()); } return new ProtocolOutput(null, new ProtocolStatus(e)); @@ -477,11 +480,11 @@ public abstract class HttpBase implements Protocol { */ public String getProxyHost() { - return proxyHost; + return this.proxyHost; } public int getProxyPort() { - return proxyPort; + return this.proxyPort; } public boolean useProxy(URL url) { @@ -493,38 +496,38 @@ public abstract class HttpBase implements Protocol { } public boolean useProxy(String host) { - if (useProxy && proxyException.containsKey(host)) { + if (this.useProxy && this.proxyException.containsKey(host)) { return false; } - return useProxy; + return this.useProxy; } public int getTimeout() { - return timeout; + return this.timeout; } public boolean isIfModifiedSinceEnabled() { - return enableIfModifiedsinceHeader; + return this.enableIfModifiedsinceHeader; } public boolean isCookieEnabled() { - return enableCookieHeader; + return this.enableCookieHeader; } public boolean isStoreIPAddress() { - return storeIPAddress; + return this.storeIPAddress; } public boolean isStoreHttpRequest() { - return storeHttpRequest; + return this.storeHttpRequest; } public boolean isStoreHttpHeaders() { - return storeHttpHeaders; + return this.storeHttpHeaders; } public int getMaxContent() { - return maxContent; + return this.maxContent; } /** @@ -533,7 +536,7 @@ public abstract class HttpBase implements Protocol { * @return the maximum duration */ public int getMaxDuration() { - return maxDuration; + return this.maxDuration; } /** @@ -542,15 +545,15 @@ public abstract class HttpBase implements Protocol { * @return true if partially fetched truncated content is stored */ public boolean isStorePartialAsTruncated() { - return partialAsTruncated; + return this.partialAsTruncated; } public String getUserAgent() { - if (userAgentNames != null) { - return userAgentNames - .get(ThreadLocalRandom.current().nextInt(userAgentNames.size())); + if (this.userAgentNames != null) { + return this.userAgentNames + .get(ThreadLocalRandom.current().nextInt(this.userAgentNames.size())); } - return userAgent; + return this.userAgent; } /** @@ -562,8 +565,8 @@ public abstract class HttpBase implements Protocol { * @return the cookie or null */ public String getCookie(URL url) { - if (hostCookies != null) { - return hostCookies.get(url.getHost()); + if (this.hostCookies != null) { + return this.hostCookies.get(url.getHost()); } return null; @@ -575,38 +578,37 @@ public abstract class HttpBase implements Protocol { * @return The value of the header "Accept-Language" header. */ public String getAcceptLanguage() { - return acceptLanguage; + return this.acceptLanguage; } public String getAcceptCharset() { - return acceptCharset; + return this.acceptCharset; } public String getAccept() { - return accept; + return this.accept; } public boolean getUseHttp11() { - return useHttp11; + return this.useHttp11; } public boolean isTlsCheckCertificates() { - return tlsCheckCertificate; + return this.tlsCheckCertificate; } public Set<String> getTlsPreferredCipherSuites() { - return tlsPreferredCipherSuites; + return this.tlsPreferredCipherSuites; } public Set<String> getTlsPreferredProtocols() { - return tlsPreferredProtocols; + return this.tlsPreferredProtocols; } private static String getAgentString(String agentName, String agentVersion, String agentDesc, String agentURL, String agentEmail) { if ((agentName == null) || (agentName.trim().length() == 0)) { - // TODO : NUTCH-258 if (LOG.isErrorEnabled()) { LOG.error("No User-Agent string set (http.agent.name)!"); } @@ -645,16 +647,16 @@ public abstract class HttpBase implements Protocol { } protected void logConf() { - if (logger.isInfoEnabled()) { - logger.info("http.proxy.host = " + proxyHost); - logger.info("http.proxy.port = " + proxyPort); - logger.info("http.proxy.exception.list = " + useProxy); - logger.info("http.timeout = " + timeout); - logger.info("http.content.limit = " + maxContent); - logger.info("http.agent = " + userAgent); - logger.info("http.accept.language = " + acceptLanguage); - logger.info("http.accept = " + accept); - logger.info("http.enable.cookie.header = " + isCookieEnabled()); + if (this.logger.isInfoEnabled()) { + this.logger.info("http.proxy.host = " + this.proxyHost); + this.logger.info("http.proxy.port = " + this.proxyPort); + this.logger.info("http.proxy.exception.list = " + this.useProxy); + this.logger.info("http.timeout = " + this.timeout); + this.logger.info("http.content.limit = " + this.maxContent); + this.logger.info("http.agent = " + this.userAgent); + this.logger.info("http.accept.language = " + this.acceptLanguage); + this.logger.info("http.accept = " + this.accept); + this.logger.info("http.enable.cookie.header = " + isCookieEnabled()); } } @@ -748,7 +750,7 @@ public abstract class HttpBase implements Protocol { if (content != null) { System.out.println("Content Type: " + content.getContentType()); System.out.println("Content Length: " - + content.getMetadata().get(Response.CONTENT_LENGTH)); + + content.getMetadata().get(HttpHeaders.CONTENT_LENGTH)); System.out.println("Content:"); String text = new String(content.getContent()); System.out.println(text); @@ -761,7 +763,7 @@ public abstract class HttpBase implements Protocol { @Override public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, List<Content> robotsTxtContent) { - return robots.getRobotRulesSet(this, url, robotsTxtContent); + return this.robots.getRobotRulesSet(this, url, robotsTxtContent); } /** @@ -771,7 +773,7 @@ public abstract class HttpBase implements Protocol { * String[] * @return a new HashMap */ - private HashMap<String, String> arrayToMap(String[] input) { + private static HashMap<String, String> arrayToMap(String[] input) { if (input == null || input.length == 0) { return new HashMap<String, String>(); } diff --git a/src/plugin/protocol-foo/plugin.xml b/src/plugin/protocol-foo/plugin.xml index 850afe33f..d34f6242a 100755 --- a/src/plugin/protocol-foo/plugin.xml +++ b/src/plugin/protocol-foo/plugin.xml @@ -40,7 +40,7 @@ <implementation id="org.apache.nutch.protocol.foo.Foo" class="org.apache.nutch.protocol.foo.Foo"> <parameter name="protocolName" value="foo"/> - <parameter name="urlStreamHandler" value="org.apache.nutch.protocol.foo.Handler"/> + <parameter name="urlStreamHandler" value="org.apache.nutch.protocol.foo.Handler"/> </implementation> </extension> diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml index 36befcf45..ead823247 100644 --- a/src/plugin/protocol-okhttp/ivy.xml +++ b/src/plugin/protocol-okhttp/ivy.xml @@ -36,8 +36,8 @@ </publications> <dependencies> - <dependency org="com.squareup.okhttp3" name="okhttp" rev="4.9.1"/> - <dependency org="com.squareup.okhttp3" name="okhttp-brotli" rev="4.9.1"/> + <dependency org="com.squareup.okhttp3" name="okhttp" rev="4.9.3"/> + <dependency org="com.squareup.okhttp3" name="okhttp-brotli" rev="4.9.3"/> </dependencies> </ivy-module> diff --git a/src/plugin/protocol-okhttp/plugin.xml b/src/plugin/protocol-okhttp/plugin.xml index 5138a155e..e2183d2b5 100755 --- a/src/plugin/protocol-okhttp/plugin.xml +++ b/src/plugin/protocol-okhttp/plugin.xml @@ -26,15 +26,15 @@ <export name="*"/> </library> <!-- dependencies of OkHttp --> - <library name="okhttp-4.9.1.jar"/> - <library name="okhttp-brotli-4.9.1.jar"/> - <library name="okio-2.8.0.jar"/> + <library name="annotations-13.0.jar"/> + <library name="dec-0.1.2.jar"/> <library name="kotlin-stdlib-1.4.10.jar"/> <library name="kotlin-stdlib-common-1.4.10.jar"/> <library name="kotlin-stdlib-jdk7-1.4.10.jar"/> <library name="kotlin-stdlib-jdk8-1.4.10.jar"/> - <library name="annotations-13.0.jar"/> - <library name="dec-0.1.2.jar"/> + <library name="okhttp-4.9.3.jar"/> + <library name="okhttp-brotli-4.9.3.jar"/> + <library name="okio-2.8.0.jar"/> <!-- end of dependencies of OkHttp --> </runtime> @@ -54,7 +54,7 @@ <implementation id="org.apache.nutch.protocol.okhttp.OkHttp" class="org.apache.nutch.protocol.okhttp.OkHttp"> - <parameter name="protocolName" value="https"/> + <parameter name="protocolName" value="https"/> </implementation> </extension> diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java index 65cb2d394..d5ab77ec5 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java @@ -112,7 +112,7 @@ public class OkHttp extends HttpBase { // protocols in order of preference List<okhttp3.Protocol> protocols = new ArrayList<>(); - if (useHttp2) { + if (this.useHttp2) { protocols.add(okhttp3.Protocol.HTTP_2); } protocols.add(okhttp3.Protocol.HTTP_1_1); @@ -121,11 +121,11 @@ public class OkHttp extends HttpBase { .protocols(protocols) // .retryOnConnectionFailure(true) // .followRedirects(false) // - .connectTimeout(timeout, TimeUnit.MILLISECONDS) - .writeTimeout(timeout, TimeUnit.MILLISECONDS) - .readTimeout(timeout, TimeUnit.MILLISECONDS); + .connectTimeout(this.timeout, TimeUnit.MILLISECONDS) + .writeTimeout(this.timeout, TimeUnit.MILLISECONDS) + .readTimeout(this.timeout, TimeUnit.MILLISECONDS); - if (!tlsCheckCertificate) { + if (!this.tlsCheckCertificate) { builder.sslSocketFactory(trustAllSslSocketFactory, (X509TrustManager) trustAllCerts[0]); builder.hostnameVerifier(new HostnameVerifier() { @@ -136,23 +136,23 @@ public class OkHttp extends HttpBase { }); } - if (!accept.isEmpty()) { - getCustomRequestHeaders().add(new String[] { "Accept", accept }); + if (!this.accept.isEmpty()) { + getCustomRequestHeaders().add(new String[] { "Accept", this.accept }); } - if (!acceptLanguage.isEmpty()) { + if (!this.acceptLanguage.isEmpty()) { getCustomRequestHeaders() - .add(new String[] { "Accept-Language", acceptLanguage }); + .add(new String[] { "Accept-Language", this.acceptLanguage }); } - if (!acceptCharset.isEmpty()) { + if (!this.acceptCharset.isEmpty()) { getCustomRequestHeaders() - .add(new String[] { "Accept-Charset", acceptCharset }); + .add(new String[] { "Accept-Charset", this.acceptCharset }); } - if (useProxy) { - Proxy proxy = new Proxy(proxyType, - new InetSocketAddress(proxyHost, proxyPort)); + if (this.useProxy) { + Proxy proxy = new Proxy(this.proxyType, + new InetSocketAddress(this.proxyHost, this.proxyPort)); String proxyUsername = conf.get("http.proxy.username"); if (proxyUsername == null) { ProxySelector selector = new ProxySelector() { @@ -172,9 +172,9 @@ public class OkHttp extends HttpBase { @Override public List<Proxy> select(URI uri) { if (useProxy(uri)) { - return proxyList; + return this.proxyList; } - return noProxyList; + return this.noProxyList; } @Override @@ -192,7 +192,7 @@ public class OkHttp extends HttpBase { * ProxySelector class with proxy auth. If a proxy username is present, * the configured proxy will be used for ALL requests. */ - if (proxyException.size() > 0) { + if (this.proxyException.size() > 0) { LOG.warn( "protocol-okhttp does not respect 'http.proxy.exception.list' setting when " + "'http.proxy.username' is set. This is a limitation of the current okhttp3 " @@ -214,14 +214,14 @@ public class OkHttp extends HttpBase { } } - if (storeIPAddress || storeHttpHeaders || storeHttpRequest) { + if (this.storeIPAddress || this.storeHttpHeaders || this.storeHttpRequest) { builder.addNetworkInterceptor(new HTTPHeadersInterceptor()); } // enable support for Brotli compression (Content-Encoding) builder.addInterceptor(BrotliInterceptor.INSTANCE); - client = builder.build(); + this.client = builder.build(); } class HTTPHeadersInterceptor implements Interceptor { @@ -241,7 +241,7 @@ public class OkHttp extends HttpBase { Connection connection = chain.connection(); String ipAddress = null; - if (storeIPAddress) { + if (OkHttp.this.storeIPAddress) { InetAddress address = connection.socket().getInetAddress(); ipAddress = address.getHostAddress(); } @@ -252,7 +252,7 @@ public class OkHttp extends HttpBase { StringBuilder requestverbatim = null; StringBuilder responseverbatim = null; - if (storeHttpRequest) { + if (OkHttp.this.storeHttpRequest) { requestverbatim = new StringBuilder(); requestverbatim.append(request.method()).append(' '); @@ -277,7 +277,7 @@ public class OkHttp extends HttpBase { requestverbatim.append("\r\n"); } - if (storeHttpHeaders) { + if (OkHttp.this.storeHttpHeaders) { responseverbatim = new StringBuilder(); responseverbatim.append(getNormalizedProtocolName(response.protocol())) @@ -322,11 +322,11 @@ public class OkHttp extends HttpBase { } protected List<String[]> getCustomRequestHeaders() { - return customRequestHeaders; + return this.customRequestHeaders; } protected OkHttpClient getClient() { - return client; + return this.client; } @Override diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index 54675e3b2..6dcbe16ed 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -55,15 +55,15 @@ public class OkHttpResponse implements Response { } public void setReason(TruncatedContentReason val) { - value = val; + this.value = val; } public TruncatedContentReason getReason() { - return value; + return this.value; } public boolean booleanValue() { - return value != TruncatedContentReason.NOT_TRUNCATED; + return this.value != TruncatedContentReason.NOT_TRUNCATED; } } @@ -123,7 +123,7 @@ public class OkHttpResponse implements Response { response.message()); TruncatedContent truncated = new TruncatedContent(); - content = toByteArray(response.body(), truncated, okhttp.getMaxContent(), + this.content = toByteArray(response.body(), truncated, okhttp.getMaxContent(), okhttp.getMaxDuration(), okhttp.isStorePartialAsTruncated()); responsemetadata.add(FETCH_TIME, Long.toString(System.currentTimeMillis())); @@ -135,11 +135,11 @@ public class OkHttpResponse implements Response { responsemetadata.set(TRUNCATED_CONTENT_REASON, truncated.getReason().toString().toLowerCase(Locale.ROOT)); LOG.debug("HTTP content truncated to {} bytes (reason: {})", - content.length, truncated.getReason()); + this.content.length, truncated.getReason()); } - code = response.code(); - headers = responsemetadata; + this.code = response.code(); + this.headers = responsemetadata; } } @@ -179,7 +179,7 @@ public class OkHttpResponse implements Response { if (partialAsTruncated && source.getBuffer().size() > 0) { // treat already fetched content as truncated truncated.setReason(TruncatedContentReason.DISCONNECT); - LOG.info("Truncated content for {}, partial fetch caused by:", url, + LOG.info("Truncated content for {}, partial fetch caused by:", this.url, e); } else { throw e; @@ -222,23 +222,23 @@ public class OkHttpResponse implements Response { } public URL getUrl() { - return url; + return this.url; } public int getCode() { - return code; + return this.code; } public String getHeader(String name) { - return headers.get(name); + return this.headers.get(name); } public Metadata getHeaders() { - return headers; + return this.headers; } public byte[] getContent() { - return content; + return this.content; } }