Author: kwright
Date: Thu Jun 13 15:55:55 2013
New Revision: 1492719
URL: http://svn.apache.org/r1492719
Log:
Add logging output describing why a document was rejected. Part of CONNECTORS-715.
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1492719&r1=1492718&r2=1492719&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
(original)
+++
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Thu Jun 13 15:55:55 2013
@@ -5604,7 +5604,10 @@ public class WebcrawlerConnector extends
if (interestingMimeTypeMap.get(contentType) != null)
return true;
- return activities.checkMimeTypeIndexable(contentType);
+ boolean rval = activities.checkMimeTypeIndexable(contentType);
+ if (rval == false && Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"',
not fetching because output connector does not want mimetype
'"+contentType+"'");
+ return rval;
}
/** Code to check if an already-fetched document should be ingested.
@@ -5616,13 +5619,25 @@ public class WebcrawlerConnector extends
return false;
if
(activities.checkLengthIndexable(cache.getDataLength(documentIdentifier)) ==
false)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"',
not indexing because output connector thinks length
"+cache.getDataLength(documentIdentifier)+" is too long");
return false;
-
+ }
+
if (activities.checkURLIndexable(documentIdentifier) == false)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"',
not indexing because output connector does not want URL");
return false;
+ }
if (filter.isDocumentIndexable(documentIdentifier) == false)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"',
not indexing because document does not match web job constraints");
return false;
+ }
// Check if it's a recognized content type
String contentType = cache.getContentType(documentIdentifier);
@@ -5645,7 +5660,10 @@ public class WebcrawlerConnector extends
contentType = contentType.substring(0,pos);
contentType = contentType.trim();
- return activities.checkMimeTypeIndexable(contentType);
+ boolean rval = activities.checkMimeTypeIndexable(contentType);
+ if (rval == false && Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"',
not indexing because output connector does not want mime type
'"+contentType+"'");
+ return rval;
}
/** Find a redirection URI, if it exists */