Author: kwright
Date: Wed Sep 25 08:05:09 2013
New Revision: 1526124
URL: http://svn.apache.org/r1526124
Log:
Fix for CONNECTORS-782.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1526124&r1=1526123&r2=1526124&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Wed Sep 25 08:05:09 2013
@@ -3,6 +3,9 @@ $Id$
======================= 1.4-dev =====================
+CONNECTORS-782: Add unique-ID metadata in SharePoint connector.
+(Dmitry Goldenberg, Karl Wright)
+
CONNECTORS-778: Add support for attachments in SharePoint
connector.
(Dmitry Goldenberg, Karl Wright)
Modified:
manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java?rev=1526124&r1=1526123&r2=1526124&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
(original)
+++
manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
Wed Sep 25 08:05:09 2013
@@ -765,8 +765,6 @@ public class SharePointRepository extend
String[] denyTokens =
activities.retrieveParentData(documentIdentifier, "denyTokens");
String[] listIDs =
activities.retrieveParentData(documentIdentifier, "guids");
String[] listFields =
activities.retrieveParentData(documentIdentifier, "fields");
- // Grab the ID from the carrydown data; it's needed to find
the attachments.
- String[] ids =
activities.retrieveParentData(documentIdentifier, "ids");
String listID;
if (listIDs.length >= 1)
@@ -774,13 +772,7 @@ public class SharePointRepository extend
else
listID = null;
- String id;
- if (ids.length >= 1)
- id = ids[0];
- else
- id = null;
-
- if (listID != null && id != null)
+ if (listID != null)
{
String[] sortedMetadataFields =
getInterestingFieldSetSorted(metadataInfo,listFields);
@@ -792,11 +784,15 @@ public class SharePointRepository extend
ArrayList metadataDescription = new ArrayList();
metadataDescription.add("Modified");
metadataDescription.add("Created");
+ metadataDescription.add("ID");
+ metadataDescription.add("GUID");
// The document path includes the library, with no leading
slash, and is decoded.
String decodedItemPathWithoutSite =
decodedItemPath.substring(cutoff+1);
Map<String,String> values = proxy.getFieldValues(
metadataDescription, encodedSitePath, listID, "/Lists/" +
decodedItemPathWithoutSite, dspStsWorks );
String modifiedDate = values.get("Modified");
String createdDate = values.get("Created");
+ String id = values.get("ID");
+ String guid = values.get("GUID");
if (modifiedDate != null)
{
// Item has a modified date so we presume it exists.
@@ -817,6 +813,7 @@ public class SharePointRepository extend
packDate(sb,modifiedDateValue);
packDate(sb,createdDateValue);
pack(sb,id,'+');
+ pack(sb,guid,'+');
// The rest of this is unparseable
sb.append(versionToken);
sb.append(pathNameAttributeVersion);
@@ -999,6 +996,7 @@ public class SharePointRepository extend
metadataDescription.add("Last_x0020_Modified");
metadataDescription.add("Modified");
metadataDescription.add("Created");
+ metadataDescription.add("GUID");
// The document path includes the library, with no leading
slash, and is decoded.
int cutoff = decodedLibPath.lastIndexOf("/");
String decodedDocumentPathWithoutSite =
decodedDocumentPath.substring(cutoff);
@@ -1006,8 +1004,9 @@ public class SharePointRepository extend
String modifiedDate = values.get("Modified");
String createdDate = values.get("Created");
-
+ String guid = values.get("GUID");
String modifyDate = values.get("Last_x0020_Modified");
+
if (modifyDate != null)
{
// Item has a modified date, so we presume it exists
@@ -1051,6 +1050,7 @@ public class SharePointRepository extend
packList(sb,denyTokens,'+');
packDate(sb,modifiedDateValue);
packDate(sb,createdDateValue);
+ pack(sb,guid,'+');
// The rest of this is unparseable
sb.append(versionToken);
sb.append(pathNameAttributeVersion);
@@ -1175,6 +1175,8 @@ public class SharePointRepository extend
return sortedMetadataFields;
}
+ protected static final String[] attachmentDataNames = new
String[]{"createdDate","modifiedDate","accessTokens","denyTokens","url","guids"};
+
/** Process a set of documents.
* This is the method that should cause each document to be fetched,
processed, and the results either added
* to the queue of documents for the current job, and/or entered into the
incremental ingestion manager.
@@ -1346,6 +1348,11 @@ public class SharePointRepository extend
StringBuilder idBuffer = new StringBuilder();
startPosition = unpack(idBuffer,version,startPosition,'+');
+ // List item GUID (for metadata)
+ StringBuilder guidBuffer = new StringBuilder();
+ startPosition = unpack(guidBuffer,version,startPosition,'+');
+ String guid = guidBuffer.toString();
+
// We need the list ID, which we've already fetched, so grab
that from the parent data.
String[] listIDs =
activities.retrieveParentData(documentIdentifier, "guids");
@@ -1378,8 +1385,7 @@ public class SharePointRepository extend
// we unpacked the version information early above.
// No check for inclusion; if the list item is included, so
is this
- String[] dataNames = new
String[]{"createdDate","modifiedDate","accessTokens","denyTokens","url"};
- String[][] dataValues = new String[5][];
+ String[][] dataValues = new
String[attachmentDataNames.length][];
if (createdDate == null)
dataValues[0] = new String[0];
else
@@ -1397,9 +1403,10 @@ public class SharePointRepository extend
else
dataValues[3] = (String[])denyAcls.toArray(new String[0]);
dataValues[4] = new String[]{attachmentName.getPrettyName()};
+ dataValues[5] = new String[]{guid};
activities.addDocumentReference(documentIdentifier + "/" +
attachmentName.getValue(),
- documentIdentifier, null, dataNames, dataValues);
+ documentIdentifier, null, attachmentDataNames, dataValues);
}
}
@@ -1458,7 +1465,8 @@ public class SharePointRepository extend
data.addField(fieldName,fieldData);
}
}
-
+ data.addField("GUID",guid);
+
activities.ingestDocument( documentIdentifier, version,
itemUrl , data );
}
finally
@@ -1506,18 +1514,41 @@ public class SharePointRepository extend
if (createdDate.getTime() == 0L)
createdDate = null;
- // Fetch and index. This also filters documents based on
output connector restrictions.
- String fileUrl = serverUrl + encodePath(urlBuffer.toString());
- String fetchUrl = fileUrl;
- if (!fetchAndIndexFile(activities, documentIdentifier,
version, fileUrl, fetchUrl,
- accessTokens, denyTokens, createdDate, modifiedDate, null,
sDesc))
+ // We need the parent list item's GUID (carried down under "guids"), so grab
that from the parent data.
+ String[] guids =
activities.retrieveParentData(documentIdentifier, "guids");
+ String guid;
+ if (guids.length >= 1)
+ guid = guids[0];
+ else
+ guid = null;
+
+ if (guid != null)
+ {
+ String url = urlBuffer.toString();
+ int lastIndex = url.lastIndexOf("/");
+ guid = guid + ":" + url.substring(lastIndex+1);
+
+ // Fetch and index. This also filters documents based on
output connector restrictions.
+ String fileUrl = serverUrl + encodePath(url);
+ String fetchUrl = fileUrl;
+ if (!fetchAndIndexFile(activities, documentIdentifier,
version, fileUrl, fetchUrl,
+ accessTokens, denyTokens, createdDate, modifiedDate, null,
guid, sDesc))
+ {
+ // Document not indexed for whatever reason
+ activities.deleteDocument(documentIdentifier,version);
+ i++;
+ continue;
+ }
+ }
+ else
{
- // Document not indexed for whatever reason
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("SharePoint: Skipping attachment
'"+documentIdentifier+"' because no parent guid found");
activities.deleteDocument(documentIdentifier,version);
i++;
continue;
}
-
+
}
}
}
@@ -1644,6 +1675,11 @@ public class SharePointRepository extend
if (createdDate.getTime() == 0L)
createdDate = null;
+ // Document GUID (for metadata)
+ StringBuilder guidBuffer = new StringBuilder();
+ startPosition = unpack(guidBuffer,version,startPosition,'+');
+ String guid = guidBuffer.toString();
+
// Generate the URL we are going to use
String fileUrl = fileBaseUrl + encodedDocumentPath;
if (Logging.connectors.isDebugEnabled())
@@ -1686,7 +1722,7 @@ public class SharePointRepository extend
// Fetch and index. This also filters documents based on output
connector restrictions.
if (!fetchAndIndexFile(activities, documentIdentifier, version,
fileUrl, serverUrl + encodedServerLocation + encodedDocumentPath,
- acls, denyAcls, createdDate, modifiedDate, metadataValues,
sDesc))
+ acls, denyAcls, createdDate, modifiedDate, metadataValues,
guid, sDesc))
{
// Document not indexed for whatever reason
activities.deleteDocument(documentIdentifier,version);
@@ -1782,7 +1818,7 @@ public class SharePointRepository extend
*/
protected boolean fetchAndIndexFile(IProcessActivity activities, String
documentIdentifier, String version,
String fileUrl, String fetchUrl, ArrayList acls, ArrayList denyAcls, Date
createdDate, Date modifiedDate,
- Map<String,String> metadataValues, SystemMetadataDescription sDesc)
+ Map<String,String> metadataValues, String guid, SystemMetadataDescription
sDesc)
throws ManifoldCFException, ServiceInterruption
{
// Before we fetch, confirm that the output connector will accept the
document
@@ -1923,6 +1959,8 @@ public class SharePointRepository extend
data.addField(fieldName,fieldData);
}
}
+ data.addField("GUID",guid);
+
activities.ingestDocument( documentIdentifier, version,
fileUrl , data );
return true;
}
@@ -2154,7 +2192,7 @@ public class SharePointRepository extend
}
}
- protected final static String[] listItemStreamDataNames = new
String[]{"accessTokens", "denyTokens", "guids", "fields", "ids"};
+ protected final static String[] listItemStreamDataNames = new
String[]{"accessTokens", "denyTokens", "guids", "fields"};
protected class ListItemStream implements IFileStream
{
@@ -2214,26 +2252,6 @@ public class SharePointRepository extend
// The way I've chosen to do this is to use a triple slash at
that point, as a separator.
String modifiedPath = relPath.substring(0,siteListPath.length())
+ "//" + relPath.substring(siteListPath.length());
- // Evil hack!!!
- // Come up with the ID based on the URL. This SHOULD come from
SharePoint via addFile, above, but
- // this requires a new release of the plugin for SharePoint
2010, and SPSProxyHelper revision and testing on SharePoint 2007.
- String itemRef = relPath.substring(siteListPath.length());
- String itemID;
- if (itemRef.length() > 1)
- {
- int undIndex = itemRef.indexOf("_",1);
- if (undIndex != -1)
- itemID = itemRef.substring(1,undIndex);
- else
- itemID = itemRef.substring(1);
- }
- else
- itemID = null;
-
- if (itemID == null)
- dataValues[4] = new String[0];
- else
- dataValues[4] = new String[]{itemID};
activities.addDocumentReference( modifiedPath,
documentIdentifier, null, listItemStreamDataNames, dataValues );
}
else