This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit a73bd14ab37a5c7b4004fdc0e05d86e810baed74
Author: Sebastian Nagel <[email protected]>
AuthorDate: Fri Jul 10 16:01:18 2020 +0200

    [NUTCH-2801] RobotRulesParser command-line checker to use http.robots.agents as fall-back
    - clarify comment regarding bypassing the confidence check for a non-empty http.agent.name
---
 src/java/org/apache/nutch/protocol/RobotRulesParser.java | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 6889216..2cb52a6 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -380,8 +380,14 @@ public abstract class RobotRulesParser implements Tool {
   private static class TestRobotRulesParser extends RobotRulesParser {
 
     public void setConf(Configuration conf) {
-      // make sure that agent name is set so that setConf() does not complain,
-      // the agent name is later overwritten by command-line argument
+      /*
+       * Make sure that agent name is not empty so that
+       * RobotRulesParser.setConf() does not complain.
+       * 
+       * If provided the agent names passed as command-line argument are
+       * checked, see RobotRulesParser.run(...). Also http.agent.name is then
+       * filled taking the first agent name from command-line.
+       */
       if (conf.get("http.agent.name", "").isEmpty()) {
         String firstRobotsAgent = conf.get("http.robots.agents", 
"").split(",")[0].trim();
         if (firstRobotsAgent.isEmpty()) {

Reply via email to