Author: kwright
Date: Mon Sep 15 18:18:59 2014
New Revision: 1625103
URL: http://svn.apache.org/r1625103
Log:
Revamp wiki connector. Part of CONNECTORS-977.
Modified:
manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
Modified:
manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1625103&r1=1625102&r2=1625103&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Mon Sep 15 18:18:59 2014
@@ -899,94 +899,86 @@ public class WikiConnector extends org.a
return new Long(seedTime).toString();
}
- /** Get document versions given an array of document identifiers.
- * This method is called for EVERY document that is considered. It is
- * therefore important to perform as little work as possible here.
- *@param documentIdentifiers is the array of local document identifiers, as understood by this connector.
- *@param oldVersions is the corresponding array of version strings that have been saved for the document identifiers.
- * A null value indicates that this is a first-time fetch, while an empty string indicates that the previous document
- * had an empty version string.
- *@param activities is the interface this method should use to perform whatever framework actions are desired.
- *@param spec is the current document specification for the current job. If there is a dependency on this
- * specification, then the version string should include the pertinent data, so that reingestion will occur
- * when the specification changes. This is primarily useful for metadata.
- *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
- *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
- *@return the corresponding version strings, with null in the places where the document no longer exists.
- * Empty version strings indicate that there is no versioning ability for the corresponding document, and the document
- * will always be processed.
- */
- @Override
- public String[] getDocumentVersions(String[] documentIdentifiers, String[] oldVersions, IVersionActivity activities,
- DocumentSpecification spec, int jobMode, boolean usesDefaultAuthority)
- throws ManifoldCFException, ServiceInterruption
- {
- Map<String,String> versions = new HashMap<String,String>();
- getTimestamps(documentIdentifiers,versions,activities);
- String[] rval = new String[documentIdentifiers.length];
- for (int i = 0 ; i < rval.length ; i++)
- {
- rval[i] = versions.get(documentIdentifiers[i]);
- }
- return rval;
- }
-
/** Process a set of documents.
* This is the method that should cause each document to be fetched, processed, and the results either added
* to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
* The document specification allows this class to filter what is done based on the job.
+ * The connector will be connected before this method can be called.
*@param documentIdentifiers is the set of document identifiers to process.
- *@param versions is the corresponding document versions to process, as returned by getDocumentVersions() above.
- * The implementation may choose to ignore this parameter and always process the current version.
+ *@param statuses are the currently-stored document versions for each document in the set of document identifiers
+ * passed in above.
*@param activities is the interface this method should use to queue up new document references
* and ingest documents.
- *@param spec is the document specification.
- *@param scanOnly is an array corresponding to the document identifiers. It is set to true to indicate when the processing
- * should only find other references, and should not actually call the ingestion methods.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
*/
- public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities,
- DocumentSpecification spec, boolean[] scanOnly, int jobMode)
+ @Override
+ public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
+ IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
throws ManifoldCFException, ServiceInterruption
{
// Forced acls
String[] acls = getAcls(spec);
- Map<String,String> urls = new HashMap<String,String>();
- getDocURLs(documentIdentifiers,urls);
- for (int i = 0 ; i < documentIdentifiers.length ; i++)
+ Map<String,String> versions = new HashMap<String,String>();
+ getTimestamps(documentIdentifiers,versions,activities);
+
+ List<String> fetchDocuments = new ArrayList<String>();
+ for (String documentIdentifier : documentIdentifiers)
{
- if (!scanOnly[i])
+ String versionString = versions.get(documentIdentifier);
+ if (versionString == null)
{
- String url = urls.get(documentIdentifiers[i]);
- if (url != null)
- getDocInfo(documentIdentifiers[i], versions[i], url, activities, acls);
+ activities.deleteDocument(documentIdentifier);
+ continue;
}
+
+ if (!activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
+ continue;
+
+ fetchDocuments.add(documentIdentifier);
+ }
+
+ if (fetchDocuments.size() == 0)
+ return;
+
+ String[] fetchDocumentsArray = fetchDocuments.toArray(new String[0]);
+ Map<String,String> urls = new HashMap<String,String>();
+ getDocURLs(documentIdentifiers,urls);
+ for (String documentIdentifier : fetchDocumentsArray)
+ {
+ String url = urls.get(documentIdentifier);
+ String versionString = versions.get(documentIdentifier);
+ if (url != null)
+ getDocInfo(documentIdentifier, versionString, url, activities, acls);
+ else
+ activities.noDocument(documentIdentifier,versionString);
}
- }
+ }
+
/**
* Grab forced acl out of document specification.
*
* @param spec is the document specification.
* @return the acls.
*/
- protected static String[] getAcls(DocumentSpecification spec) {
- HashMap map = new HashMap();
- int i = 0;
- while (i < spec.getChildCount()) {
- SpecificationNode sn = spec.getChild(i++);
+ protected static String[] getAcls(Specification spec) {
+ Set<String> aclMap = new HashSet<String>();
+ for (int i = 0; i < spec.getChildCount(); i++)
+ {
+ SpecificationNode sn = spec.getChild(i);
if (sn.getType().equals("access")) {
String token = sn.getAttributeValue("token");
- map.put(token, token);
+ aclMap.add(token);
}
}
- String[] rval = new String[map.size()];
- Iterator iter = map.keySet().iterator();
- i = 0;
- while (iter.hasNext()) {
- rval[i++] = (String) iter.next();
+ String[] rval = new String[aclMap.size()];
+ int j = 0;
+ for (String acl : aclMap)
+ {
+ rval[j++] = acl;
}
return rval;
}
@@ -3213,10 +3205,10 @@ public class WikiConnector extends org.a
/** Thread to execute a "get timestamp" operation. This thread both executes the operation and parses the result. */
protected static class ExecuteGetTimestampThread extends Thread
{
- protected HttpClient client;
- protected HttpRequestBase executeMethod;
+ protected final HttpClient client;
+ protected final HttpRequestBase executeMethod;
protected Throwable exception = null;
- protected Map<String,String> versions;
+ protected final Map<String,String> versions;
protected boolean loginNeeded = false;
public ExecuteGetTimestampThread(HttpClient client, HttpRequestBase executeMethod, Map<String,String> versions)