Author: schuch
Date: Thu Mar  2 06:00:55 2017
New Revision: 1785071

URL: http://svn.apache.org/viewvc?rev=1785071&view=rev
Log:
CONNECTORS-1392: Adds option for webcrawler to ignore meta robots

Added:
    
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java
   (with props)
Modified:
    manifoldcf/trunk/CHANGES.txt
    
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
    
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
    
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
    
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_es_ES.properties
    
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
    
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
    
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/NavigationHSQLDBUI.java
    manifoldcf/trunk/connectors/webcrawler/pom.xml
    
manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
    
manifoldcf/trunk/site/src/documentation/resources/images/en_US/web-configure-robots.PNG

Modified: manifoldcf/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Thu Mar  2 06:00:55 2017
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 2.7-dev =====================
 
+CONNECTORS-1392: The Webcrawler connector now has an option to ignore
+meta robots tags in HTML pages
+(Markus Schuch)
+
 CONNECTORS-1390: Add user-settable connect timeout field for
 Active Directory authority.
 (Cihad Guzel, Karl Wright)

Modified: 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
 (original)
+++ 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
 Thu Mar  2 06:00:55 2017
@@ -44,6 +44,8 @@ public class WebcrawlerConfig
 
   /** Robots usage (a parameter) */
   public static final String PARAMETER_ROBOTSUSAGE = "Robots usage";
+  /** Meta robots tags usage (a parameter) */
+  public static final String PARAMETER_META_ROBOTS_TAGS_USAGE = "Meta robots 
tags usage";
   /** Email (a parameter) */
   public static final String PARAMETER_EMAIL = "Email address";
   /** Proxy host name (parameter) */

Modified: 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
 (original)
+++ 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
 Thu Mar  2 06:00:55 2017
@@ -123,6 +123,9 @@ public class WebcrawlerConnector extends
   protected static final int ROBOTS_DATA = 1;
   protected static final int ROBOTS_ALL = 2;
 
+  protected static final int META_ROBOTS_NONE = 0;
+  protected static final int META_ROBOTS_ALL = 1;
+
   // Relationship types
   public final static String REL_LINK = "link";
   public final static String REL_REDIRECT = "redirect";
@@ -162,6 +165,8 @@ public class WebcrawlerConnector extends
   
   /** Robots usage flag */
   protected int robotsUsage = ROBOTS_ALL;
+  /** Meta robots tag usage flag */
+  protected int metaRobotsTagsUsage = META_ROBOTS_ALL;
   /** The user-agent for this connector instance */
   protected String userAgent = null;
   /** The email address for this connector instance */
@@ -364,8 +369,6 @@ public class WebcrawlerConnector extends
     // Handle everything else
     if (!isInitialized)
     {
-      String x;
-
       // Either set this from the connection name, or just have one.  Right 
now, we have one.
       String throttleGroupName = "";
       
@@ -375,15 +378,21 @@ public class WebcrawlerConnector extends
       userAgent = "Mozilla/5.0 (ApacheManifoldCFWebCrawler; "+emailAddress+")";
       from = emailAddress;
 
-      x = params.getParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE);
+      String robotsTxt = 
params.getParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE);
       robotsUsage = ROBOTS_ALL;
-      if (x == null || x.length() == 0 || x.equals("all"))
+      if (robotsTxt == null || robotsTxt.length() == 0 || 
robotsTxt.equals("all"))
         robotsUsage = ROBOTS_ALL;
-      else if (x.equals("none"))
+      else if (robotsTxt.equals("none"))
         robotsUsage = ROBOTS_NONE;
-      else if (x.equals("data"))
+      else if (robotsTxt.equals("data"))
         robotsUsage = ROBOTS_DATA;
 
+      String metaRobots = 
params.getParameter(WebcrawlerConfig.PARAMETER_META_ROBOTS_TAGS_USAGE);
+      if (metaRobots == null || metaRobots.length() == 0 || 
metaRobots.equals("all"))
+        metaRobotsTagsUsage = META_ROBOTS_ALL;
+      else if (metaRobots.equals("none"))
+        metaRobotsTagsUsage = META_ROBOTS_NONE;
+      
       throttleDescription = new ThrottleDescription(params);
       credentialsDescription = new CredentialsDescription(params);
       trustsDescription = new TrustsDescription(params);
@@ -1889,6 +1898,9 @@ public class WebcrawlerConnector extends
     String robotsUsage = 
parameters.getParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE);
     if (robotsUsage == null)
       robotsUsage = "all";
+    String metaRobotsTagsUsage = 
parameters.getParameter(WebcrawlerConfig.PARAMETER_META_ROBOTS_TAGS_USAGE);
+    if (metaRobotsTagsUsage == null)
+      metaRobotsTagsUsage = "all";
     String proxyHost = 
parameters.getParameter(WebcrawlerConfig.PARAMETER_PROXYHOST);
     if (proxyHost == null)
       proxyHost = "";
@@ -1985,13 +1997,23 @@ public class WebcrawlerConnector extends
 "      </select>\n"+
 "    </td>\n"+
 "  </tr>\n"+
+"  <tr>\n"+
+"    <td class=\"description\"><nobr>" + 
Messages.getBodyString(locale,"WebcrawlerConnector.MetaRobotsTagsUsage") + 
"</nobr></td>\n"+
+"    <td class=\"value\">\n"+
+"      <select name=\"metarobotstagsusage\" size=\"3\">\n"+
+"        <option value=\"none\" 
"+(metaRobotsTagsUsage.equals("none")?"selected=\"selected\"":"")+">" + 
Messages.getBodyString(locale,"WebcrawlerConnector.DontLookAtMetaRobotsTags") + 
"</option>\n"+
+"        <option value=\"all\" 
"+(metaRobotsTagsUsage.equals("all")?"selected=\"selected\"":"")+">" + 
Messages.getBodyString(locale,"WebcrawlerConnector.ObeyMetaRobotsTags") + 
"</option>\n"+
+"      </select>\n"+
+"    </td>\n"+
+"  </tr>\n"+
 "</table>\n"
       );
     }
     else
     {
       out.print(
-"<input type=\"hidden\" name=\"robotsusage\" value=\""+robotsUsage+"\"/>\n"
+"<input type=\"hidden\" name=\"robotsusage\" value=\""+robotsUsage+"\"/>\n"+
+"<input type=\"hidden\" name=\"metarobotstagsusage\" 
value=\""+metaRobotsTagsUsage+"\"/>\n"
       );
     }
 
@@ -2883,6 +2905,9 @@ public class WebcrawlerConnector extends
     String robotsUsage = variableContext.getParameter("robotsusage");
     if (robotsUsage != null)
       
parameters.setParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE,robotsUsage);
+    String obeyMetaRobotsTags = 
variableContext.getParameter("metarobotstagsusage");
+    if (obeyMetaRobotsTags != null)
+      
parameters.setParameter(WebcrawlerConfig.PARAMETER_META_ROBOTS_TAGS_USAGE, 
obeyMetaRobotsTags);
     String proxyHost = variableContext.getParameter("proxyhost");
     if (proxyHost != null)
       parameters.setParameter(WebcrawlerConfig.PARAMETER_PROXYHOST,proxyHost);
@@ -3277,11 +3302,16 @@ public class WebcrawlerConnector extends
     String email = parameters.getParameter(WebcrawlerConfig.PARAMETER_EMAIL);
     String robots = 
parameters.getParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE);
     if (robots.equals("none"))
-      robots = "Ignore robots.txt";
+      robots = 
Messages.getBodyString(locale,"WebcrawlerConnector.DontLookAtRobotsTxt");
     else if (robots.equals("data"))
-      robots = "Obey robots.txt for data fetches only";
+      robots = 
Messages.getBodyString(locale,"WebcrawlerConnector.ObeyRobotsTxtForDataFetchesOnly");
     else if (robots.equals("all"))
-      robots = "Obey robots.txt for all fetches";
+      robots = 
Messages.getBodyString(locale,"WebcrawlerConnector.ObeyRobotsTxtForAllFetches");
+    String metaRobotsTagsUsage = 
parameters.getParameter(WebcrawlerConfig.PARAMETER_META_ROBOTS_TAGS_USAGE);
+    if (metaRobotsTagsUsage == null || metaRobotsTagsUsage.equals("all"))
+      metaRobotsTagsUsage = 
Messages.getBodyString(locale,"WebcrawlerConnector.ObeyMetaRobotsTags");
+    else if (metaRobotsTagsUsage.equals("none"))
+      metaRobotsTagsUsage = 
Messages.getBodyString(locale,"WebcrawlerConnector.DontLookAtMetaRobotsTags");
     String proxyHost = 
parameters.getParameter(WebcrawlerConfig.PARAMETER_PROXYHOST);
     if (proxyHost == null)
       proxyHost = "";
@@ -3299,9 +3329,13 @@ public class WebcrawlerConnector extends
 "<table class=\"displaytable\">\n"+
 "  <tr>\n"+
 "    <td class=\"description\" 
colspan=\"1\"><nobr>"+Messages.getBodyString(locale,"WebcrawlerConnector.EmailAddress")+"</nobr></td>\n"+
-"    <td class=\"value\" colspan=\"1\">"+Encoder.bodyEscape(email)+"</td>\n"+
+"    <td class=\"value\" colspan=\"3\">"+Encoder.bodyEscape(email)+"</td>\n"+
+"  </tr>\n"+
+"  <tr>\n"+
 "    <td class=\"description\" 
colspan=\"1\"><nobr>"+Messages.getBodyString(locale,"WebcrawlerConnector.RobotsUsage")+"</nobr></td>\n"+
 "    <td class=\"value\" 
colspan=\"1\"><nobr>"+Encoder.bodyEscape(robots)+"</nobr></td>\n"+
+"    <td class=\"description\" 
colspan=\"1\"><nobr>"+Messages.getBodyString(locale,"WebcrawlerConnector.MetaRobotsTagsUsage")+"</nobr></td>\n"+
+"    <td class=\"value\" 
colspan=\"1\">"+Encoder.bodyEscape(metaRobotsTagsUsage)+"</td>\n"+
 "  </tr>\n"+
 "  <tr>\n"+
 "    <td class=\"description\"><nobr>" + 
Messages.getBodyString(locale,"WebcrawlerConnector.ProxyHostColon") + 
"</nobr></td>\n"+
@@ -6048,7 +6082,7 @@ public class WebcrawlerConnector extends
     if (Logging.connectors.isDebugEnabled() && redirectHandler.shouldIndex() 
== false)
       Logging.connectors.debug("Web: Not indexing document 
'"+documentIdentifier+"' because of redirection");
     // For html, we don't want any actions, because we don't do form 
submission.
-    ProcessActivityHTMLHandler htmlHandler = new 
ProcessActivityHTMLHandler(documentIdentifier,activities,filter);
+    ProcessActivityHTMLHandler htmlHandler = new 
ProcessActivityHTMLHandler(documentIdentifier,activities,filter,metaRobotsTagsUsage);
     handleHTML(documentIdentifier,htmlHandler);
     if (Logging.connectors.isDebugEnabled() && htmlHandler.shouldIndex() == 
false)
       Logging.connectors.debug("Web: Not indexing document 
'"+documentIdentifier+"' because of HTML robots or content tags prohibiting 
indexing");
@@ -6124,11 +6158,13 @@ public class WebcrawlerConnector extends
   {
     boolean allowIndex = true;
     boolean allowFollow = true;
+    boolean obeyMetaRobotsTags = true;
     
     /** Constructor. */
-    public ProcessActivityHTMLHandler(String documentIdentifier, 
IProcessActivity activities, DocumentURLFilter filter)
+    public ProcessActivityHTMLHandler(String documentIdentifier, 
IProcessActivity activities, DocumentURLFilter filter, int metaRobotTagsUsage)
     {
       super(documentIdentifier,activities,filter,"html",REL_LINK);
+      this.obeyMetaRobotsTags = metaRobotTagsUsage == META_ROBOTS_ALL;
     }
 
     /** Decide whether we should index. */
@@ -6152,7 +6188,7 @@ public class WebcrawlerConnector extends
       throws ManifoldCFException
     {
       String name = (String)metaAttributes.get("name");
-      if (name != null && name.toLowerCase(Locale.ROOT).equals("robots"))
+      if (obeyMetaRobotsTags && name != null && 
name.toLowerCase(Locale.ROOT).equals("robots"))
       {
         String contentValue = (String)metaAttributes.get("content");
         if (contentValue != null)

Modified: 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
 (original)
+++ 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
 Thu Mar  2 06:00:55 2017
@@ -29,6 +29,9 @@ WebcrawlerConnector.RobotsTxtUsage=Robot
 WebcrawlerConnector.DontLookAtRobotsTxt=Don't look at robots.txt
 WebcrawlerConnector.ObeyRobotsTxtForDataFetchesOnly=Obey robots.txt for data 
fetches only
 WebcrawlerConnector.ObeyRobotsTxtForAllFetches=Obey robots.txt for all fetches
+WebcrawlerConnector.MetaRobotsTagsUsage=Meta robots tags usage:
+WebcrawlerConnector.DontLookAtMetaRobotsTags=Don't look at meta robots tags
+WebcrawlerConnector.ObeyMetaRobotsTags=Obey meta robots tags
 WebcrawlerConnector.Throttles=Throttles:
 WebcrawlerConnector.BinRegularExpression=Bin regular expression
 WebcrawlerConnector.CaseInsensitive=Case insensitive?

Modified: 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_es_ES.properties
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_es_ES.properties?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_es_ES.properties
 (original)
+++ 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_es_ES.properties
 Thu Mar  2 06:00:55 2017
@@ -29,6 +29,9 @@ WebcrawlerConnector.RobotsTxtUsage=uso r
 WebcrawlerConnector.DontLookAtRobotsTxt=No mire robots.txt
 WebcrawlerConnector.ObeyRobotsTxtForDataFetchesOnly=Obedecer robots.txt para 
datos obtiene solamente
 WebcrawlerConnector.ObeyRobotsTxtForAllFetches=Obedecer robots.txt para todas 
las recuperaciones
+WebcrawlerConnector.MetaRobotsTagsUsage=Meta robots tags usage:
+WebcrawlerConnector.DontLookAtMetaRobotsTags=Don't look at meta robots tags
+WebcrawlerConnector.ObeyMetaRobotsTags=Obey meta robots tags
 WebcrawlerConnector.Throttles=aceleradores:
 WebcrawlerConnector.BinRegularExpression=Expresión regular Bin
 WebcrawlerConnector.CaseInsensitive=mayúsculas y minúsculas?

Modified: 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
 (original)
+++ 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
 Thu Mar  2 06:00:55 2017
@@ -29,6 +29,9 @@ WebcrawlerConnector.RobotsTxtUsage=Robot
 WebcrawlerConnector.DontLookAtRobotsTxt=robots.txtを利用しない
 WebcrawlerConnector.ObeyRobotsTxtForDataFetchesOnly=データ取得の場合のみにrobots.txtに従う
 WebcrawlerConnector.ObeyRobotsTxtForAllFetches=すべてrobots.txtに従う
+WebcrawlerConnector.MetaRobotsTagsUsage=Meta robots tags usage:
+WebcrawlerConnector.DontLookAtMetaRobotsTags=Don't look at meta robots tags
+WebcrawlerConnector.ObeyMetaRobotsTags=Obey meta robots tags
 WebcrawlerConnector.Throttles=スロットル:
 WebcrawlerConnector.BinRegularExpression=Bin正規表現
 WebcrawlerConnector.CaseInsensitive=大/小文字を区別しない

Modified: 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
 (original)
+++ 
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
 Thu Mar  2 06:00:55 2017
@@ -29,6 +29,9 @@ WebcrawlerConnector.RobotsTxtUsage=�
 WebcrawlerConnector.DontLookAtRobotsTxt=不使用robots.txt
 
WebcrawlerConnector.ObeyRobotsTxtForDataFetchesOnly=只在提取数据时服从robots.txt
 WebcrawlerConnector.ObeyRobotsTxtForAllFetches=每次提取均服从robots.txt
+WebcrawlerConnector.MetaRobotsTagsUsage=Meta robots tags usage:
+WebcrawlerConnector.DontLookAtMetaRobotsTags=Don't look at meta robots tags
+WebcrawlerConnector.ObeyMetaRobotsTags=Obey meta robots tags
 WebcrawlerConnector.Throttles=限流器: 
 WebcrawlerConnector.BinRegularExpression=Bin正则表达式
 WebcrawlerConnector.CaseInsensitive=不区分大小写

Added: 
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java?rev=1785071&view=auto
==============================================================================
--- 
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java
 (added)
+++ 
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java
 Thu Mar  2 06:00:55 2017
@@ -0,0 +1,49 @@
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
+import 
org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector.DocumentURLFilter;
+import 
org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector.ProcessActivityHTMLHandler;
+import org.apache.manifoldcf.crawler.interfaces.IProcessActivity;
+import org.junit.Before;
+import org.junit.Test;
+
+public class ProcessActivityHTMLHandlerTest {
+
+  private WebcrawlerConnector webcrawler = new WebcrawlerConnector();
+
+  private Map<String, String> metaRobotsNoindexNofollow = new HashMap<>();
+
+  @Before
+  public void setup() {
+    metaRobotsNoindexNofollow.put("name", "robots");
+    metaRobotsNoindexNofollow.put("content", "noindex,nofollow");
+  }
+
+  @Test
+  public void testNoteMetaTag_robotsInstructionsAreObeyed() throws 
ManifoldCFException {
+    IProcessActivity mockActivity = mock(IProcessActivity.class);
+    DocumentURLFilter filter = mock(DocumentURLFilter.class);
+    ProcessActivityHTMLHandler sut = webcrawler.new 
ProcessActivityHTMLHandler("id", mockActivity, filter, 
WebcrawlerConnector.META_ROBOTS_ALL);
+    sut.noteMetaTag(metaRobotsNoindexNofollow);
+    assertFalse(sut.allowIndex);
+    assertFalse(sut.allowFollow);
+  }
+
+  @Test
+  public void testNoteMetaTag_robotsInstructionsCanBeIgnored() throws 
ManifoldCFException {
+    IProcessActivity mockActivity = mock(IProcessActivity.class);
+    DocumentURLFilter filter = mock(DocumentURLFilter.class);
+    ProcessActivityHTMLHandler sut = webcrawler.new 
ProcessActivityHTMLHandler("id", mockActivity, filter, 
WebcrawlerConnector.META_ROBOTS_NONE);
+    sut.noteMetaTag(metaRobotsNoindexNofollow);
+    assertTrue(sut.allowIndex);
+    assertTrue(sut.allowFollow);
+  }
+
+}

Propchange: 
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/NavigationHSQLDBUI.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/NavigationHSQLDBUI.java?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/NavigationHSQLDBUI.java
 (original)
+++ 
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/NavigationHSQLDBUI.java
 Thu Mar  2 06:00:55 2017
@@ -134,6 +134,8 @@ public class NavigationHSQLDBUI extends
     form = 
window.findForm(testerInstance.createStringDescription("editconnection"));
     selectbox = 
form.findSelectbox(testerInstance.createStringDescription("robotsusage"));
     selectbox.selectValue(testerInstance.createStringDescription("none"));
+    selectbox = 
form.findSelectbox(testerInstance.createStringDescription("metarobotstagsusage"));
+    selectbox.selectValue(testerInstance.createStringDescription("none"));
     // Bandwidth
     link = window.findLink(testerInstance.createStringDescription("Bandwidth 
tab"));
     link.click();

Modified: manifoldcf/trunk/connectors/webcrawler/pom.xml
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/pom.xml?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/pom.xml (original)
+++ manifoldcf/trunk/connectors/webcrawler/pom.xml Thu Mar  2 06:00:55 2017
@@ -233,6 +233,12 @@
       <scope>test</scope>
     </dependency>
     <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <version>${mockito.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>mcf-core</artifactId>
       <version>${project.version}</version>

Modified: 
manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- 
manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
 (original)
+++ 
manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
 Thu Mar  2 06:00:55 2017
@@ -1,4 +1,4 @@
-<?xml version="1.0"?>
+<?xml version="1.0"?>
 
 <!--
  Licensed to the Apache Software Foundation (ASF) under one or more
@@ -2729,7 +2729,7 @@ curl -XGET http://localhost:9200/index/_
                 <br/><br/>
                 <figure src="images/en_US/web-configure-robots.PNG" alt="Web 
Connection, Robots tab" width="80%"/>
                 <br/><br/>
-                <p>Select how the connection will interpret robots.txt.  
Remember that you have an interest in crawling people's sites as politely as is 
possible.</p>
+                <p>Select how the connection will interpret robots.txt and 
&lt;meta name=&quot;robots ...&gt; tags on HTML pages.  Remember that you have 
an interest in crawling people's sites as politely as is possible.</p>
                 <p>The "Bandwidth" tab allows you to specify a list of 
bandwidth rules.  Each rule has a regular expression matched against a URL's 
throttle bin.
                        Throttle bins, in connections of the Web type, are 
simply the server name part of the URL.  Each rule allows you to select a 
maximum bandwidth, number of
                        connections, and fetch rate.  You can have as many 
rules as you like; if a URL matches more than one rule, then the most 
conservative value will be used.</p>

Modified: 
manifoldcf/trunk/site/src/documentation/resources/images/en_US/web-configure-robots.PNG
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/resources/images/en_US/web-configure-robots.PNG?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
Binary files - no diff available.


Reply via email to