This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit a73bd14ab37a5c7b4004fdc0e05d86e810baed74
Author: Sebastian Nagel <[email protected]>
AuthorDate: Fri Jul 10 16:01:18 2020 +0200

    [NUTCH-2801] RobotsRulesParser command-line checker to use http.robots.agents as fall-back
    - clarify comment regarding bypassing the confidence check for a non-empty http.agent.name
---
 src/java/org/apache/nutch/protocol/RobotRulesParser.java | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 6889216..2cb52a6 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -380,8 +380,14 @@ public abstract class RobotRulesParser implements Tool {
   private static class TestRobotRulesParser extends RobotRulesParser {
     public void setConf(Configuration conf) {
-      // make sure that agent name is set so that setConf() does not complain,
-      // the agent name is later overwritten by command-line argument
+      /*
+       * Make sure that agent name is not empty so that
+       * RobotRulesParser.setConf() does not complain.
+       *
+       * If provided the agent names passed as command-line argument are
+       * checked, see RobotRulesParser.run(...). Also http.agent.name is then
+       * filled taking the first agent name from command-line.
+       */
       if (conf.get("http.agent.name", "").isEmpty()) {
         String firstRobotsAgent = conf.get("http.robots.agents", "").split(",")[0].trim();
         if (firstRobotsAgent.isEmpty()) {
