Author: kwright
Date: Thu Jun 13 17:11:23 2013
New Revision: 1492767
URL: http://svn.apache.org/r1492767
Log:
Add more logging for indexing prohibition. Part of CONNECTORS-715.
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1492767&r1=1492766&r2=1492767&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Thu Jun 13 17:11:23 2013
@@ -5738,11 +5738,17 @@ public class WebcrawlerConnector extends
{
ProcessActivityRedirectionHandler redirectHandler = new ProcessActivityRedirectionHandler(documentIdentifier,activities,filter);
handleRedirects(documentIdentifier,redirectHandler);
+ if (Logging.connectors.isDebugEnabled() && redirectHandler.shouldIndex() == false)
+ Logging.connectors.debug("Web: Not indexing document '"+documentIdentifier+"' because of redirection");
// For html, we don't want any actions, because we don't do form submission.
ProcessActivityHTMLHandler htmlHandler = new ProcessActivityHTMLHandler(documentIdentifier,activities,filter);
handleHTML(documentIdentifier,htmlHandler);
+ if (Logging.connectors.isDebugEnabled() && htmlHandler.shouldIndex() == false)
+ Logging.connectors.debug("Web: Not indexing document '"+documentIdentifier+"' because of HTML robots or content tags prohibiting indexing");
ProcessActivityXMLHandler xmlHandler = new ProcessActivityXMLHandler(documentIdentifier,activities,filter);
handleXML(documentIdentifier,xmlHandler);
+ if (Logging.connectors.isDebugEnabled() && xmlHandler.shouldIndex() == false)
+ Logging.connectors.debug("Web: Not indexing document '"+documentIdentifier+"' because of XML robots or content tags prohibiting indexing");
// May add more later for other extraction tasks.
return htmlHandler.shouldIndex() && redirectHandler.shouldIndex() && xmlHandler.shouldIndex();
}