Author: schuch
Date: Thu Mar 2 06:00:55 2017
New Revision: 1785071
URL: http://svn.apache.org/viewvc?rev=1785071&view=rev
Log:
CONNECTORS-1392: Adds option for webcrawler to ignore meta robots
Added:
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java
(with props)
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_es_ES.properties
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/NavigationHSQLDBUI.java
manifoldcf/trunk/connectors/webcrawler/pom.xml
manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
manifoldcf/trunk/site/src/documentation/resources/images/en_US/web-configure-robots.PNG
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Thu Mar 2 06:00:55 2017
@@ -3,6 +3,10 @@ $Id$
======================= 2.7-dev =====================
+CONNECTORS-1392: The Webcrawler connector now has an option to ignore
+meta robots tags in HTML pages
+(Markus Schuch)
+
CONNECTORS-1390: Add user-settable connect timeout field for
Active Directory authority.
(Cihad Guzel, Karl Wright)
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
(original)
+++
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
Thu Mar 2 06:00:55 2017
@@ -44,6 +44,8 @@ public class WebcrawlerConfig
/** Robots usage (a parameter) */
public static final String PARAMETER_ROBOTSUSAGE = "Robots usage";
+ /** Meta robots tags usage (a parameter) */
+ public static final String PARAMETER_META_ROBOTS_TAGS_USAGE = "Meta robots
tags usage";
/** Email (a parameter) */
public static final String PARAMETER_EMAIL = "Email address";
/** Proxy host name (parameter) */
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
(original)
+++
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Thu Mar 2 06:00:55 2017
@@ -123,6 +123,9 @@ public class WebcrawlerConnector extends
protected static final int ROBOTS_DATA = 1;
protected static final int ROBOTS_ALL = 2;
+ protected static final int META_ROBOTS_NONE = 0;
+ protected static final int META_ROBOTS_ALL = 1;
+
// Relationship types
public final static String REL_LINK = "link";
public final static String REL_REDIRECT = "redirect";
@@ -162,6 +165,8 @@ public class WebcrawlerConnector extends
/** Robots usage flag */
protected int robotsUsage = ROBOTS_ALL;
+ /** Meta robots tag usage flag */
+ protected int metaRobotsTagsUsage = META_ROBOTS_ALL;
/** The user-agent for this connector instance */
protected String userAgent = null;
/** The email address for this connector instance */
@@ -364,8 +369,6 @@ public class WebcrawlerConnector extends
// Handle everything else
if (!isInitialized)
{
- String x;
-
// Either set this from the connection name, or just have one. Right
now, we have one.
String throttleGroupName = "";
@@ -375,15 +378,21 @@ public class WebcrawlerConnector extends
userAgent = "Mozilla/5.0 (ApacheManifoldCFWebCrawler; "+emailAddress+")";
from = emailAddress;
- x = params.getParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE);
+ String robotsTxt =
params.getParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE);
robotsUsage = ROBOTS_ALL;
- if (x == null || x.length() == 0 || x.equals("all"))
+ if (robotsTxt == null || robotsTxt.length() == 0 ||
robotsTxt.equals("all"))
robotsUsage = ROBOTS_ALL;
- else if (x.equals("none"))
+ else if (robotsTxt.equals("none"))
robotsUsage = ROBOTS_NONE;
- else if (x.equals("data"))
+ else if (robotsTxt.equals("data"))
robotsUsage = ROBOTS_DATA;
+ String metaRobots =
params.getParameter(WebcrawlerConfig.PARAMETER_META_ROBOTS_TAGS_USAGE);
+ if (metaRobots == null || metaRobots.length() == 0 ||
metaRobots.equals("all"))
+ metaRobotsTagsUsage = META_ROBOTS_ALL;
+ else if (metaRobots.equals("none"))
+ metaRobotsTagsUsage = META_ROBOTS_NONE;
+
throttleDescription = new ThrottleDescription(params);
credentialsDescription = new CredentialsDescription(params);
trustsDescription = new TrustsDescription(params);
@@ -1889,6 +1898,9 @@ public class WebcrawlerConnector extends
String robotsUsage =
parameters.getParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE);
if (robotsUsage == null)
robotsUsage = "all";
+ String metaRobotsTagsUsage =
parameters.getParameter(WebcrawlerConfig.PARAMETER_META_ROBOTS_TAGS_USAGE);
+ if (metaRobotsTagsUsage == null)
+ metaRobotsTagsUsage = "all";
String proxyHost =
parameters.getParameter(WebcrawlerConfig.PARAMETER_PROXYHOST);
if (proxyHost == null)
proxyHost = "";
@@ -1985,13 +1997,23 @@ public class WebcrawlerConnector extends
" </select>\n"+
" </td>\n"+
" </tr>\n"+
+" <tr>\n"+
+" <td class=\"description\"><nobr>" +
Messages.getBodyString(locale,"WebcrawlerConnector.MetaRobotsTagsUsage") +
"</nobr></td>\n"+
+" <td class=\"value\">\n"+
+" <select name=\"metarobotstagsusage\" size=\"3\">\n"+
+" <option value=\"none\"
"+(metaRobotsTagsUsage.equals("none")?"selected=\"selected\"":"")+">" +
Messages.getBodyString(locale,"WebcrawlerConnector.DontLookAtMetaRobotsTags") +
"</option>\n"+
+" <option value=\"all\"
"+(metaRobotsTagsUsage.equals("all")?"selected=\"selected\"":"")+">" +
Messages.getBodyString(locale,"WebcrawlerConnector.ObeyMetaRobotsTags") +
"</option>\n"+
+" </select>\n"+
+" </td>\n"+
+" </tr>\n"+
"</table>\n"
);
}
else
{
out.print(
-"<input type=\"hidden\" name=\"robotsusage\" value=\""+robotsUsage+"\"/>\n"
+"<input type=\"hidden\" name=\"robotsusage\" value=\""+robotsUsage+"\"/>\n"+
+"<input type=\"hidden\" name=\"metarobotstagsusage\"
value=\""+metaRobotsTagsUsage+"\"/>\n"
);
}
@@ -2883,6 +2905,9 @@ public class WebcrawlerConnector extends
String robotsUsage = variableContext.getParameter("robotsusage");
if (robotsUsage != null)
parameters.setParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE,robotsUsage);
+ String obeyMetaRobotsTags =
variableContext.getParameter("metarobotstagsusage");
+ if (obeyMetaRobotsTags != null)
+
parameters.setParameter(WebcrawlerConfig.PARAMETER_META_ROBOTS_TAGS_USAGE,
obeyMetaRobotsTags);
String proxyHost = variableContext.getParameter("proxyhost");
if (proxyHost != null)
parameters.setParameter(WebcrawlerConfig.PARAMETER_PROXYHOST,proxyHost);
@@ -3277,11 +3302,16 @@ public class WebcrawlerConnector extends
String email = parameters.getParameter(WebcrawlerConfig.PARAMETER_EMAIL);
String robots =
parameters.getParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE);
if (robots.equals("none"))
- robots = "Ignore robots.txt";
+ robots =
Messages.getBodyString(locale,"WebcrawlerConnector.DontLookAtRobotsTxt");
else if (robots.equals("data"))
- robots = "Obey robots.txt for data fetches only";
+ robots =
Messages.getBodyString(locale,"WebcrawlerConnector.ObeyRobotsTxtForDataFetchesOnly");
else if (robots.equals("all"))
- robots = "Obey robots.txt for all fetches";
+ robots =
Messages.getBodyString(locale,"WebcrawlerConnector.ObeyRobotsTxtForAllFetches");
+ String metaRobotsTagsUsage =
parameters.getParameter(WebcrawlerConfig.PARAMETER_META_ROBOTS_TAGS_USAGE);
+ if (metaRobotsTagsUsage == null || metaRobotsTagsUsage.equals("all"))
+ metaRobotsTagsUsage =
Messages.getBodyString(locale,"WebcrawlerConnector.ObeyMetaRobotsTags");
+ else if (metaRobotsTagsUsage.equals("none"))
+ metaRobotsTagsUsage =
Messages.getBodyString(locale,"WebcrawlerConnector.DontLookAtMetaRobotsTags");
String proxyHost =
parameters.getParameter(WebcrawlerConfig.PARAMETER_PROXYHOST);
if (proxyHost == null)
proxyHost = "";
@@ -3299,9 +3329,13 @@ public class WebcrawlerConnector extends
"<table class=\"displaytable\">\n"+
" <tr>\n"+
" <td class=\"description\"
colspan=\"1\"><nobr>"+Messages.getBodyString(locale,"WebcrawlerConnector.EmailAddress")+"</nobr></td>\n"+
-" <td class=\"value\" colspan=\"1\">"+Encoder.bodyEscape(email)+"</td>\n"+
+" <td class=\"value\" colspan=\"3\">"+Encoder.bodyEscape(email)+"</td>\n"+
+" </tr>\n"+
+" <tr>\n"+
" <td class=\"description\"
colspan=\"1\"><nobr>"+Messages.getBodyString(locale,"WebcrawlerConnector.RobotsUsage")+"</nobr></td>\n"+
" <td class=\"value\"
colspan=\"1\"><nobr>"+Encoder.bodyEscape(robots)+"</nobr></td>\n"+
+" <td class=\"description\"
colspan=\"1\"><nobr>"+Messages.getBodyString(locale,"WebcrawlerConnector.MetaRobotsTagsUsage")+"</nobr></td>\n"+
+" <td class=\"value\"
colspan=\"1\">"+Encoder.bodyEscape(metaRobotsTagsUsage)+"</td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" +
Messages.getBodyString(locale,"WebcrawlerConnector.ProxyHostColon") +
"</nobr></td>\n"+
@@ -6048,7 +6082,7 @@ public class WebcrawlerConnector extends
if (Logging.connectors.isDebugEnabled() && redirectHandler.shouldIndex()
== false)
Logging.connectors.debug("Web: Not indexing document
'"+documentIdentifier+"' because of redirection");
// For html, we don't want any actions, because we don't do form
submission.
- ProcessActivityHTMLHandler htmlHandler = new
ProcessActivityHTMLHandler(documentIdentifier,activities,filter);
+ ProcessActivityHTMLHandler htmlHandler = new
ProcessActivityHTMLHandler(documentIdentifier,activities,filter,metaRobotsTagsUsage);
handleHTML(documentIdentifier,htmlHandler);
if (Logging.connectors.isDebugEnabled() && htmlHandler.shouldIndex() ==
false)
Logging.connectors.debug("Web: Not indexing document
'"+documentIdentifier+"' because of HTML robots or content tags prohibiting
indexing");
@@ -6124,11 +6158,13 @@ public class WebcrawlerConnector extends
{
boolean allowIndex = true;
boolean allowFollow = true;
+ boolean obeyMetaRobotsTags = true;
/** Constructor. */
- public ProcessActivityHTMLHandler(String documentIdentifier,
IProcessActivity activities, DocumentURLFilter filter)
+ public ProcessActivityHTMLHandler(String documentIdentifier,
IProcessActivity activities, DocumentURLFilter filter, int metaRobotTagsUsage)
{
super(documentIdentifier,activities,filter,"html",REL_LINK);
+ this.obeyMetaRobotsTags = metaRobotTagsUsage == META_ROBOTS_ALL;
}
/** Decide whether we should index. */
@@ -6152,7 +6188,7 @@ public class WebcrawlerConnector extends
throws ManifoldCFException
{
String name = (String)metaAttributes.get("name");
- if (name != null && name.toLowerCase(Locale.ROOT).equals("robots"))
+ if (obeyMetaRobotsTags && name != null &&
name.toLowerCase(Locale.ROOT).equals("robots"))
{
String contentValue = (String)metaAttributes.get("content");
if (contentValue != null)
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
(original)
+++
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
Thu Mar 2 06:00:55 2017
@@ -29,6 +29,9 @@ WebcrawlerConnector.RobotsTxtUsage=Robot
WebcrawlerConnector.DontLookAtRobotsTxt=Don't look at robots.txt
WebcrawlerConnector.ObeyRobotsTxtForDataFetchesOnly=Obey robots.txt for data
fetches only
WebcrawlerConnector.ObeyRobotsTxtForAllFetches=Obey robots.txt for all fetches
+WebcrawlerConnector.MetaRobotsTagsUsage=Meta robots tags usage:
+WebcrawlerConnector.DontLookAtMetaRobotsTags=Don't look at meta robots tags
+WebcrawlerConnector.ObeyMetaRobotsTags=Obey meta robots tags
WebcrawlerConnector.Throttles=Throttles:
WebcrawlerConnector.BinRegularExpression=Bin regular expression
WebcrawlerConnector.CaseInsensitive=Case insensitive?
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_es_ES.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_es_ES.properties?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_es_ES.properties
(original)
+++
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_es_ES.properties
Thu Mar 2 06:00:55 2017
@@ -29,6 +29,9 @@ WebcrawlerConnector.RobotsTxtUsage=uso r
WebcrawlerConnector.DontLookAtRobotsTxt=No mire robots.txt
WebcrawlerConnector.ObeyRobotsTxtForDataFetchesOnly=Obedecer robots.txt para
datos obtiene solamente
WebcrawlerConnector.ObeyRobotsTxtForAllFetches=Obedecer robots.txt para todas
las recuperaciones
+WebcrawlerConnector.MetaRobotsTagsUsage=Meta robots tags usage:
+WebcrawlerConnector.DontLookAtMetaRobotsTags=Don't look at meta robots tags
+WebcrawlerConnector.ObeyMetaRobotsTags=Obey meta robots tags
WebcrawlerConnector.Throttles=aceleradores:
WebcrawlerConnector.BinRegularExpression=Expresión regular Bin
WebcrawlerConnector.CaseInsensitive=mayúsculas y minúsculas?
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
(original)
+++
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
Thu Mar 2 06:00:55 2017
@@ -29,6 +29,9 @@ WebcrawlerConnector.RobotsTxtUsage=Robot
WebcrawlerConnector.DontLookAtRobotsTxt=robots.txtを利用しない
WebcrawlerConnector.ObeyRobotsTxtForDataFetchesOnly=データ取得の場合のみにrobots.txtに従う
WebcrawlerConnector.ObeyRobotsTxtForAllFetches=すべてrobots.txtに従う
+WebcrawlerConnector.MetaRobotsTagsUsage=Meta robots tags usage:
+WebcrawlerConnector.DontLookAtMetaRobotsTags=Don't look at meta robots tags
+WebcrawlerConnector.ObeyMetaRobotsTags=Obey meta robots tags
WebcrawlerConnector.Throttles=スロットル：
WebcrawlerConnector.BinRegularExpression=Bin正規表現
WebcrawlerConnector.CaseInsensitive=大／小文字を区別しない
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
(original)
+++
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
Thu Mar 2 06:00:55 2017
@@ -29,6 +29,9 @@ WebcrawlerConnector.RobotsTxtUsage=�
WebcrawlerConnector.DontLookAtRobotsTxt=不使用robots.txt
WebcrawlerConnector.ObeyRobotsTxtForDataFetchesOnly=只在抓取数据时服从robots.txt
WebcrawlerConnector.ObeyRobotsTxtForAllFetches=每次抓取均服从robots.txt
+WebcrawlerConnector.MetaRobotsTagsUsage=Meta robots tags usage:
+WebcrawlerConnector.DontLookAtMetaRobotsTags=Don't look at meta robots tags
+WebcrawlerConnector.ObeyMetaRobotsTags=Obey meta robots tags
WebcrawlerConnector.Throttles=限流器:
WebcrawlerConnector.BinRegularExpression=Bin正则表达式
WebcrawlerConnector.CaseInsensitive=不区分大小写
Added:
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java?rev=1785071&view=auto
==============================================================================
---
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java
(added)
+++
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java
Thu Mar 2 06:00:55 2017
@@ -0,0 +1,49 @@
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
+import
org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector.DocumentURLFilter;
+import
org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector.ProcessActivityHTMLHandler;
+import org.apache.manifoldcf.crawler.interfaces.IProcessActivity;
+import org.junit.Before;
+import org.junit.Test;
+
+public class ProcessActivityHTMLHandlerTest {
+
+ private WebcrawlerConnector webcrawler = new WebcrawlerConnector();
+
+ private Map<String, String> metaRobotsNoindexNofollow = new HashMap<>();
+
+ @Before
+ public void setup() {
+ metaRobotsNoindexNofollow.put("name", "robots");
+ metaRobotsNoindexNofollow.put("content", "noindex,nofollow");
+ }
+
+ @Test
+ public void testNoteMetaTag_robotsInstructionsAreObeyed() throws
ManifoldCFException {
+ IProcessActivity mockActivity = mock(IProcessActivity.class);
+ DocumentURLFilter filter = mock(DocumentURLFilter.class);
+ ProcessActivityHTMLHandler sut = webcrawler.new
ProcessActivityHTMLHandler("id", mockActivity, filter,
WebcrawlerConnector.META_ROBOTS_ALL);
+ sut.noteMetaTag(metaRobotsNoindexNofollow);
+ assertFalse(sut.allowIndex);
+ assertFalse(sut.allowFollow);
+ }
+
+ @Test
+ public void testNoteMetaTag_robotsInstructionsCanBeIgnored() throws
ManifoldCFException {
+ IProcessActivity mockActivity = mock(IProcessActivity.class);
+ DocumentURLFilter filter = mock(DocumentURLFilter.class);
+ ProcessActivityHTMLHandler sut = webcrawler.new
ProcessActivityHTMLHandler("id", mockActivity, filter,
WebcrawlerConnector.META_ROBOTS_NONE);
+ sut.noteMetaTag(metaRobotsNoindexNofollow);
+ assertTrue(sut.allowIndex);
+ assertTrue(sut.allowFollow);
+ }
+
+}
Propchange:
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ProcessActivityHTMLHandlerTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/NavigationHSQLDBUI.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/NavigationHSQLDBUI.java?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/NavigationHSQLDBUI.java
(original)
+++
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/tests/NavigationHSQLDBUI.java
Thu Mar 2 06:00:55 2017
@@ -134,6 +134,8 @@ public class NavigationHSQLDBUI extends
form =
window.findForm(testerInstance.createStringDescription("editconnection"));
selectbox =
form.findSelectbox(testerInstance.createStringDescription("robotsusage"));
selectbox.selectValue(testerInstance.createStringDescription("none"));
+ selectbox =
form.findSelectbox(testerInstance.createStringDescription("metarobotstagsusage"));
+ selectbox.selectValue(testerInstance.createStringDescription("none"));
// Bandwidth
link = window.findLink(testerInstance.createStringDescription("Bandwidth
tab"));
link.click();
Modified: manifoldcf/trunk/connectors/webcrawler/pom.xml
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/pom.xml?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/pom.xml (original)
+++ manifoldcf/trunk/connectors/webcrawler/pom.xml Thu Mar 2 06:00:55 2017
@@ -233,6 +233,12 @@
<scope>test</scope>
</dependency>
<dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-core</artifactId>
+ <version>${mockito.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
<groupId>${project.groupId}</groupId>
<artifactId>mcf-core</artifactId>
<version>${project.version}</version>
Modified:
manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
---
manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
(original)
+++
manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
Thu Mar 2 06:00:55 2017
@@ -1,4 +1,4 @@
-<?xml version="1.0"?>
+<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
@@ -2729,7 +2729,7 @@ curl -XGET http://localhost:9200/index/_
<br/><br/>
<figure src="images/en_US/web-configure-robots.PNG" alt="Web
Connection, Robots tab" width="80%"/>
<br/><br/>
- <p>Select how the connection will interpret robots.txt.
Remember that you have an interest in crawling people's sites as politely as is
possible.</p>
+ <p>Select how the connection will interpret robots.txt and
&lt;meta name="robots" ...&gt; tags on HTML pages. Remember that you have
an interest in crawling people's sites as politely as is possible.</p>
<p>The "Bandwidth" tab allows you to specify a list of
bandwidth rules. Each rule has a regular expression matched against a URL's
throttle bin.
Throttle bins, in connections of the Web type, are
simply the server name part of the URL. Each rule allows you to select a
maximum bandwidth, number of
connections, and fetch rate. You can have as many
rules as you like; if a URL matches more than one rule, then the most
conservative value will be used.</p>
Modified:
manifoldcf/trunk/site/src/documentation/resources/images/en_US/web-configure-robots.PNG
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/resources/images/en_US/web-configure-robots.PNG?rev=1785071&r1=1785070&r2=1785071&view=diff
==============================================================================
Binary files - no diff available.