Author: kwright
Date: Fri Jan 18 03:53:01 2013
New Revision: 1435017
URL: http://svn.apache.org/viewvc?rev=1435017&view=rev
Log:
Pull up the rest of the fixes for CONNECTORS-613 from trunk.
Modified:
manifoldcf/branches/release-1.1-branch/ (props changed)
manifoldcf/branches/release-1.1-branch/CHANGES.txt
manifoldcf/branches/release-1.1-branch/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java
manifoldcf/branches/release-1.1-branch/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
manifoldcf/branches/release-1.1-branch/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
manifoldcf/branches/release-1.1-branch/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
manifoldcf/branches/release-1.1-branch/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConstants.java
manifoldcf/branches/release-1.1-branch/connectors/sharepoint/ (props changed)
manifoldcf/branches/release-1.1-branch/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
manifoldcf/branches/release-1.1-branch/connectors/wiki/ (props changed)
manifoldcf/branches/release-1.1-branch/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
Propchange: manifoldcf/branches/release-1.1-branch/
------------------------------------------------------------------------------
Merged /manifoldcf/trunk:r1435014
Modified: manifoldcf/branches/release-1.1-branch/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/branches/release-1.1-branch/CHANGES.txt?rev=1435017&r1=1435016&r2=1435017&view=diff
==============================================================================
--- manifoldcf/branches/release-1.1-branch/CHANGES.txt (original)
+++ manifoldcf/branches/release-1.1-branch/CHANGES.txt Fri Jan 18 03:53:01 2013
@@ -3,6 +3,11 @@ $Id$
======================= Release 1.1 =====================
+CONNECTORS-613: Add a way of getting a document's mime type
+to Solr, since Tika needs mime type in order to extract content
+since Solr 4.0.0.
+(Shinichiro Abe, Karl Wright)
+
CONNECTORS-614: Solr connection release not working right.
(Karl Wright)
Modified: manifoldcf/branches/release-1.1-branch/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/release-1.1-branch/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java?rev=1435017&r1=1435016&r2=1435017&view=diff
==============================================================================
--- manifoldcf/branches/release-1.1-branch/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java (original)
+++ manifoldcf/branches/release-1.1-branch/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java Fri Jan 18 03:53:01 2013
@@ -1573,6 +1573,8 @@ public class DCTM extends org.apache.man
String objName = object.getObjectName();
+ String contentType = object.getContentType();
+
// This particular way of getting content failed, because DFC loaded the
// whole object into memory (very very bad DFC!)
// InputStream is = objIDfSysObject.getContent();
@@ -1609,6 +1611,9 @@ public class DCTM extends org.apache.man
rval = new RepositoryDocument();
+ if (contentType != null)
+ rval.setMimeType(contentType);
+
// Handle the metadata.
// The start of the version string contains the names of the metadata. We parse it out of the
// version string, because we don't want the chance of somebody changing something after we got
Modified: manifoldcf/branches/release-1.1-branch/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/release-1.1-branch/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java?rev=1435017&r1=1435016&r2=1435017&view=diff
==============================================================================
--- manifoldcf/branches/release-1.1-branch/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java (original)
+++ manifoldcf/branches/release-1.1-branch/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java Fri Jan 18 03:53:01 2013
@@ -313,13 +313,13 @@ public class FileConnector extends org.a
static {
mimeMap = new HashMap<String,String>();
mimeMap.put("txt","text/plain");
- mimeMap.put(".pdf","application/pdf");
- mimeMap.put(".doc","application/msword");
- mimeMap.put(".docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
- mimeMap.put(".ppt","application/vnd.ms-powerpoint");
- mimeMap.put(".pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
- mimeMap.put(".xls","application/vnd.ms-excel");
- mimeMap.put(".xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ mimeMap.put("pdf","application/pdf");
+ mimeMap.put("doc","application/msword");
+ mimeMap.put("docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+ mimeMap.put("ppt","application/vnd.ms-powerpoint");
+ mimeMap.put("pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
+ mimeMap.put("xls","application/vnd.ms-excel");
+ mimeMap.put("xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
}
/** Map an extension to a mime type */
Modified: manifoldcf/branches/release-1.1-branch/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/release-1.1-branch/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java?rev=1435017&r1=1435016&r2=1435017&view=diff
==============================================================================
--- manifoldcf/branches/release-1.1-branch/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java (original)
+++ manifoldcf/branches/release-1.1-branch/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java Fri Jan 18 03:53:01 2013
@@ -986,13 +986,13 @@ public class SharedDriveConnector extend
static {
mimeMap = new HashMap<String,String>();
mimeMap.put("txt","text/plain");
- mimeMap.put(".pdf","application/pdf");
- mimeMap.put(".doc","application/msword");
- mimeMap.put(".docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
- mimeMap.put(".ppt","application/vnd.ms-powerpoint");
- mimeMap.put(".pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
- mimeMap.put(".xls","application/vnd.ms-excel");
- mimeMap.put(".xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ mimeMap.put("pdf","application/pdf");
+ mimeMap.put("doc","application/msword");
+ mimeMap.put("docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+ mimeMap.put("ppt","application/vnd.ms-powerpoint");
+ mimeMap.put("pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
+ mimeMap.put("xls","application/vnd.ms-excel");
+ mimeMap.put("xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
}
/** Map an extension to a mime type */
Modified: manifoldcf/branches/release-1.1-branch/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/release-1.1-branch/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java?rev=1435017&r1=1435016&r2=1435017&view=diff
==============================================================================
--- manifoldcf/branches/release-1.1-branch/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java (original)
+++ manifoldcf/branches/release-1.1-branch/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java Fri Jan 18 03:53:01 2013
@@ -442,6 +442,7 @@ public class JDBCConnector extends org.a
addConstant(vm,JDBCConstants.idReturnVariable,JDBCConstants.idReturnColumnName);
addConstant(vm,JDBCConstants.urlReturnVariable,JDBCConstants.urlReturnColumnName);
addConstant(vm,JDBCConstants.dataReturnVariable,JDBCConstants.dataReturnColumnName);
+ addConstant(vm,JDBCConstants.contentTypeReturnVariable,JDBCConstants.contentTypeReturnColumnName);
if (!addIDList(vm,JDBCConstants.idListVariable,documentIdentifiers,scanOnly))
return;
@@ -529,11 +530,24 @@ public class JDBCConnector extends org.a
// We will ingest something, so remove this id from the map in order that we know what we still
// need to delete when all done.
map.remove(id);
+ String contentType;
+ o = row.getValue(JDBCConstants.contentTypeReturnColumnName);
+ if (o != null)
+ contentType = readAsString(o);
+ else
+ contentType = null;
+
if (contents instanceof BinaryInput)
{
// An ingestion will take place for this document.
RepositoryDocument rd = new RepositoryDocument();
+ // Default content type is application/octet-stream for binary data
+ if (contentType == null)
+ rd.setMimeType("application/octet-stream");
+ else
+ rd.setMimeType(contentType);
+
applyAccessTokens(rd,version,spec);
applyMetadata(rd,row);
@@ -578,6 +592,12 @@ public class JDBCConnector extends org.a
byte[] bytes = value.getBytes("utf-8");
RepositoryDocument rd = new RepositoryDocument();
+ // Default content type is text/plain for character data
+ if (contentType == null)
+ rd.setMimeType("text/plain");
+ else
+ rd.setMimeType(contentType);
+
applyAccessTokens(rd,version,spec);
applyMetadata(rd,row);
@@ -1382,6 +1402,7 @@ public class JDBCConnector extends org.a
documentKnownColumns.put(JDBCConstants.idReturnColumnName,"");
documentKnownColumns.put(JDBCConstants.urlReturnColumnName,"");
documentKnownColumns.put(JDBCConstants.dataReturnColumnName,"");
+ documentKnownColumns.put(JDBCConstants.contentTypeReturnColumnName,"");
}
/** Apply metadata to a repository document.
Modified: manifoldcf/branches/release-1.1-branch/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConstants.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/release-1.1-branch/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConstants.java?rev=1435017&r1=1435016&r2=1435017&view=diff
==============================================================================
--- manifoldcf/branches/release-1.1-branch/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConstants.java (original)
+++ manifoldcf/branches/release-1.1-branch/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConstants.java Fri Jan 18 03:53:01 2013
@@ -52,7 +52,9 @@ public class JDBCConstants
public static String urlReturnColumnName = "lcf__url";
/** The name of the data return column */
public static String dataReturnColumnName = "lcf__data";
-
+ /** The name of the content type return column */
+ public static String contentTypeReturnColumnName = "lcf__contenttype";
+
/** The name of the id return variable */
public static String idReturnVariable = "IDCOLUMN";
/** The name of the version return variable */
@@ -61,6 +63,8 @@ public class JDBCConstants
public static String urlReturnVariable = "URLCOLUMN";
/** The name of the data return variable */
public static String dataReturnVariable = "DATACOLUMN";
+ /** The name of the content type return variable */
+ public static String contentTypeReturnVariable = "CONTENTTYPE";
/** The name of the start time variable */
public static String startTimeVariable = "STARTTIME";
/** The name of the end time variable */
Propchange: manifoldcf/branches/release-1.1-branch/connectors/sharepoint/
------------------------------------------------------------------------------
Merged /manifoldcf/trunk/connectors/sharepoint:r1435014
Modified: manifoldcf/branches/release-1.1-branch/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/release-1.1-branch/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java?rev=1435017&r1=1435016&r2=1435017&view=diff
==============================================================================
--- manifoldcf/branches/release-1.1-branch/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java (original)
+++ manifoldcf/branches/release-1.1-branch/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java Fri Jan 18 03:53:01 2013
@@ -1522,6 +1522,8 @@ public class SharePointRepository extend
RepositoryDocument data = new RepositoryDocument();
data.setBinary( is, documentLength );
+ data.setMimeType(mapExtensionToMimeType(documentIdentifier));
+
setDataACLs(data,acls,denyAcl);
setPathAttribute(data,sDesc,documentIdentifier);
@@ -1708,6 +1710,31 @@ public class SharePointRepository extend
}
}
+ protected final static Map<String,String> mimeMap;
+ static {
+ mimeMap = new HashMap<String,String>();
+ mimeMap.put("txt","text/plain");
+ mimeMap.put("pdf","application/pdf");
+ mimeMap.put("doc","application/msword");
+ mimeMap.put("docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+ mimeMap.put("ppt","application/vnd.ms-powerpoint");
+ mimeMap.put("pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
+ mimeMap.put("xls","application/vnd.ms-excel");
+ mimeMap.put("xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ }
+
+ /** Map an extension to a mime type */
+ protected static String mapExtensionToMimeType(String fileName)
+ {
+ int slashIndex = fileName.lastIndexOf("/");
+ if (slashIndex != -1)
+ fileName = fileName.substring(slashIndex+1);
+ int dotIndex = fileName.lastIndexOf(".");
+ if (dotIndex == -1)
+ return null;
+ return mimeMap.get(fileName.substring(dotIndex+1).toLowerCase(java.util.Locale.ROOT));
+ }
+
protected static void setDataACLs(RepositoryDocument data, ArrayList acls, String denyAcl)
{
if (acls != null)
Propchange: manifoldcf/branches/release-1.1-branch/connectors/wiki/
------------------------------------------------------------------------------
Merged /manifoldcf/trunk/connectors/wiki:r1435014
Modified: manifoldcf/branches/release-1.1-branch/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/release-1.1-branch/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1435017&r1=1435016&r2=1435017&view=diff
==============================================================================
--- manifoldcf/branches/release-1.1-branch/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ manifoldcf/branches/release-1.1-branch/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Fri Jan 18 03:53:01 2013
@@ -3528,6 +3528,10 @@ public class WikiConnector extends org.a
String lastModified = t.getLastModified();
RepositoryDocument rd = new RepositoryDocument();
+
+ // For wiki, type is always text/plain
+ rd.setMimeType("text/plain");
+
dataSize = contentFile.length();
InputStream is = new FileInputStream(contentFile);
try