Author: kwright
Date: Thu May 11 10:36:57 2017
New Revision: 1794806
URL: http://svn.apache.org/viewvc?rev=1794806&view=rev
Log:
Move tika service functionality to new connector
Added:
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/
- copied from r1794719,
manifoldcf/branches/CONNECTORS-1425/connectors/tika/
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/.gitignore
- copied unchanged from r1794722,
manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
- copied unchanged from r1794722,
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
- copied, changed from r1794772,
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
- copied unchanged from r1794722,
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties
- copied unchanged from r1794722,
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
- copied unchanged from r1794722,
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
- copied unchanged from r1794722,
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification.js
- copied unchanged from r1794722,
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification.js
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_TikaType.html
- copied unchanged from r1794722,
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_TikaType.html
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
- copied unchanged from r1794722,
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
Removed:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_TikaType.html
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/pom.xml
manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification.js
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/build.xml
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/pom.xml
Modified: manifoldcf/branches/CONNECTORS-1425/connectors/pom.xml
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/pom.xml?rev=1794806&r1=1794805&r2=1794806&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/pom.xml (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/pom.xml Thu May 11 10:36:57
2017
@@ -63,6 +63,7 @@
<module>amazoncloudsearch</module>
<module>forcedmetadata</module>
<module>tika</module>
+ <module>tikaservice</module>
<module>documentfilter</module>
<module>searchblox</module>
<module>confluence</module>
Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore?rev=1794806&r1=1794805&r2=1794806&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore Thu May 11
10:36:57 2017
@@ -1,4 +1,3 @@
-/target/
/.classpath
-/.settings/
/.project
+/.settings/
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java?rev=1794806&r1=1794805&r2=1794806&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
(original)
+++
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
Thu May 11 10:36:57 2017
@@ -37,12 +37,5 @@ public class TikaConfig {
public static final String ATTRIBUTE_SOURCE = "source";
public static final String ATTRIBUTE_TARGET = "target";
public static final String ATTRIBUTE_VALUE = "value";
- public static final String TIKAHOSTNAME_DEFAULT = "localhost";
- public static final int TIKAPORT_DEFAULT = 9998;
- public static final String NODE_TIKAHOSTNAME = "tikaHostname";
- public static final String NODE_TIKAPORT = "tikaPort";
- public static final String NODE_TIKASERVER = "tikaServer";
- public static final long TIKARETRY_DEFAULT = 10000;
- public static final String NODE_TIKARETRY = "tikaRetry";
}
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1794806&r1=1794805&r2=1794806&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
(original)
+++
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
Thu May 11 10:36:57 2017
@@ -19,47 +19,30 @@
package org.apache.manifoldcf.agents.transformation.tika;
import org.apache.manifoldcf.core.interfaces.*;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.TeeInputStream;
-import org.apache.http.HttpEntity;
-import org.apache.http.HttpHost;
-import org.apache.http.HttpResponse;
-import org.apache.http.client.HttpClient;
-import org.apache.http.client.methods.HttpPost;
-import org.apache.http.client.methods.HttpPut;
-import org.apache.http.entity.InputStreamEntity;
-import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.agents.system.Logging;
import java.io.*;
-import java.net.URI;
-import java.net.URISyntaxException;
import java.util.*;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
-import org.json.simple.JSONObject;
-import org.json.simple.parser.JSONParser;
-import org.json.simple.parser.ParseException;
import de.l3s.boilerpipe.BoilerpipeExtractor;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-/**
- * This connector works as a transformation connector, but does nothing other
- * than logging.
- *
- */
-public class TikaExtractor extends
org.apache.manifoldcf.agents.transformation.BaseTransformationConnector {
+/** This connector works as a transformation connector, but does nothing other
than logging.
+*
+*/
+public class TikaExtractor extends
org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
+{
public static final String _rcsid = "@(#)$Id$";
private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
- private static final String EDIT_SPECIFICATION_TIKATYPE_HTML =
"editSpecification_TikaType.html";
private static final String EDIT_SPECIFICATION_FIELDMAPPING_HTML =
"editSpecification_FieldMapping.html";
private static final String EDIT_SPECIFICATION_EXCEPTIONS_HTML =
"editSpecification_Exceptions.html";
private static final String EDIT_SPECIFICATION_BOILERPLATE_HTML =
"editSpecification_Boilerplate.html";
@@ -67,221 +50,159 @@ public class TikaExtractor extends org.a
protected static final String ACTIVITY_EXTRACT = "extract";
- protected static final String[] activitiesList = new String[] {
ACTIVITY_EXTRACT };
-
+ protected static final String[] activitiesList = new
String[]{ACTIVITY_EXTRACT};
+
/** We handle up to 64K in memory; after that we go to disk. */
protected static final long inMemoryMaximumFile = 65536;
-
- /**
- * Return a list of activities that this connector generates. The connector
- * does NOT need to be connected before this method is called.
- *
- * @return the set of activities.
- */
+
+ /** Return a list of activities that this connector generates.
+ * The connector does NOT need to be connected before this method is called.
+ *@return the set of activities.
+ */
@Override
- public String[] getActivitiesList() {
+ public String[] getActivitiesList()
+ {
return activitiesList;
}
- /**
- * Get an output version string, given an output specification. The output
- * version string is used to uniquely describe the pertinent details of the
- * output specification and the configuration, to allow the Connector
- * Framework to determine whether a document will need to be output again.
- * Note that the contents of the document cannot be considered by this
method,
- * and that a different version string (defined in IRepositoryConnector) is
- * used to describe the version of the actual document.
- *
- * This method presumes that the connector object has been configured, and it
- * is thus able to communicate with the output data store should that be
- * necessary.
- *
- * @param os
- * is the current output specification for the job that is doing the
- * crawling.
- * @return a string, of unlimited length, which uniquely describes output
- * configuration and specification in such a way that if two such
- * strings are equal, the document will not need to be sent again to
- * the output data store.
- */
- @Override
- public VersionContext getPipelineDescription(Specification os) throws
ManifoldCFException, ServiceInterruption {
+ /** Get an output version string, given an output specification. The output
version string is used to uniquely describe the pertinent details of
+ * the output specification and the configuration, to allow the Connector
Framework to determine whether a document will need to be output again.
+ * Note that the contents of the document cannot be considered by this
method, and that a different version string (defined in IRepositoryConnector)
+ * is used to describe the version of the actual document.
+ *
+ * This method presumes that the connector object has been configured, and it
is thus able to communicate with the output data store should that be
+ * necessary.
+ *@param os is the current output specification for the job that is doing the
crawling.
+ *@return a string, of unlimited length, which uniquely describes output
configuration and specification in such a way that if two such strings are
equal,
+ * the document will not need to be sent again to the output data store.
+ */
+ @Override
+ public VersionContext getPipelineDescription(Specification os)
+ throws ManifoldCFException, ServiceInterruption
+ {
SpecPacker sp = new SpecPacker(os);
- return new VersionContext(sp.toPackedString(), params, os);
+ return new VersionContext(sp.toPackedString(),params,os);
}
- // We intercept checks pertaining to the document format and send modified
- // checks further down
-
- /**
- * Detect if a mime type is acceptable or not. This method is used to
- * determine whether it makes sense to fetch a document in the first place.
- *
- * @param pipelineDescription
- * is the document's pipeline version string, for this connection.
- * @param mimeType
- * is the mime type of the document.
- * @param checkActivity
- * is an object including the activities that can be performed by
- * this method.
- * @return true if the mime type can be accepted by this connector.
- */
- @Override
- public boolean checkMimeTypeIndexable(VersionContext pipelineDescription,
String mimeType,
- IOutputCheckActivity checkActivity) throws ManifoldCFException,
ServiceInterruption {
+ // We intercept checks pertaining to the document format and send modified
checks further down
+
+ /** Detect if a mime type is acceptable or not. This method is used to
determine whether it makes sense to fetch a document
+ * in the first place.
+ *@param pipelineDescription is the document's pipeline version string, for
this connection.
+ *@param mimeType is the mime type of the document.
+ *@param checkActivity is an object including the activities that can be
performed by this method.
+ *@return true if the mime type can be accepted by this connector.
+ */
+ @Override
+ public boolean checkMimeTypeIndexable(VersionContext pipelineDescription,
String mimeType, IOutputCheckActivity checkActivity)
+ throws ManifoldCFException, ServiceInterruption
+ {
// We should see what Tika will transform
// MHL
// Do a downstream check
return checkActivity.checkMimeTypeIndexable("text/plain;charset=utf-8");
}
- /**
- * Pre-determine whether a document (passed here as a File object) is
- * acceptable or not. This method is used to determine whether a document
- * needs to be actually transferred. This hook is provided mainly to support
- * search engines that only handle a small set of accepted file types.
- *
- * @param pipelineDescription
- * is the document's pipeline version string, for this connection.
- * @param localFile
- * is the local file to check.
- * @param checkActivity
- * is an object including the activities that can be done by this
- * method.
- * @return true if the file is acceptable, false if not.
- */
- @Override
- public boolean checkDocumentIndexable(VersionContext pipelineDescription,
File localFile,
- IOutputCheckActivity checkActivity) throws ManifoldCFException,
ServiceInterruption {
- // Document contents are not germane anymore, unless it looks like Tika
- // won't accept them.
+ /** Pre-determine whether a document (passed here as a File object) is
acceptable or not. This method is
+ * used to determine whether a document needs to be actually transferred.
This hook is provided mainly to support
+ * search engines that only handle a small set of accepted file types.
+ *@param pipelineDescription is the document's pipeline version string, for
this connection.
+ *@param localFile is the local file to check.
+ *@param checkActivity is an object including the activities that can be done
by this method.
+ *@return true if the file is acceptable, false if not.
+ */
+ @Override
+ public boolean checkDocumentIndexable(VersionContext pipelineDescription,
File localFile, IOutputCheckActivity checkActivity)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // Document contents are not germane anymore, unless it looks like Tika
won't accept them.
// Not sure how to check that...
return true;
}
- /**
- * Pre-determine whether a document's length is acceptable. This method is
- * used to determine whether to fetch a document in the first place.
- *
- * @param pipelineDescription
- * is the document's pipeline version string, for this connection.
- * @param length
- * is the length of the document.
- * @param checkActivity
- * is an object including the activities that can be done by this
- * method.
- * @return true if the file is acceptable, false if not.
- */
- @Override
- public boolean checkLengthIndexable(VersionContext pipelineDescription, long
length,
- IOutputCheckActivity checkActivity) throws ManifoldCFException,
ServiceInterruption {
+ /** Pre-determine whether a document's length is acceptable. This method is
used
+ * to determine whether to fetch a document in the first place.
+ *@param pipelineDescription is the document's pipeline version string, for
this connection.
+ *@param length is the length of the document.
+ *@param checkActivity is an object including the activities that can be done
by this method.
+ *@return true if the file is acceptable, false if not.
+ */
+ @Override
+ public boolean checkLengthIndexable(VersionContext pipelineDescription, long
length, IOutputCheckActivity checkActivity)
+ throws ManifoldCFException, ServiceInterruption
+ {
// Always true
return true;
}
- /**
- * Add (or replace) a document in the output data store using the connector.
- * This method presumes that the connector object has been configured, and it
- * is thus able to communicate with the output data store should that be
- * necessary. The OutputSpecification is *not* provided to this method,
- * because the goal is consistency, and if output is done it must be
- * consistent with the output description, since that was what was partly
used
- * to determine if output should be taking place. So it may be necessary for
- * this method to decode an output description string in order to determine
- * what should be done.
- *
- * @param documentURI
- * is the URI of the document. The URI is presumed to be the unique
- * identifier which the output data store will use to process and
- * serve the document. This URI is constructed by the repository
- * connector which fetches the document, and is thus universal
across
- * all output connectors.
- * @param outputDescription
- * is the description string that was constructed for this document
- * by the getOutputDescription() method.
- * @param document
- * is the document data to be processed (handed to the output data
- * store).
- * @param authorityNameString
- * is the name of the authority responsible for authorizing any
- * access tokens passed in with the repository document. May be
null.
- * @param activities
- * is the handle to an object that the implementer of a pipeline
- * connector may use to perform operations, such as logging
- * processing activity, or sending a modified document to the next
- * stage in the pipeline.
- * @return the document status (accepted or permanently rejected).
- * @throws IOException
- * only if there's a stream error reading the document data.
- */
- @Override
- public int addOrReplaceDocumentWithException(String documentURI,
VersionContext pipelineDescription,
- RepositoryDocument document, String authorityNameString,
IOutputAddActivity activities)
- throws ManifoldCFException, ServiceInterruption, IOException {
- // First, make sure downstream pipeline will now accept
- // text/plain;charset=utf-8
- if (!activities.checkMimeTypeIndexable("text/plain;charset=utf-8")) {
+ /** Add (or replace) a document in the output data store using the connector.
+ * This method presumes that the connector object has been configured, and it
is thus able to communicate with the output data store should that be
+ * necessary.
+ * The OutputSpecification is *not* provided to this method, because the goal
is consistency, and if output is done it must be consistent with the
+ * output description, since that was what was partly used to determine if
output should be taking place. So it may be necessary for this method to decode
+ * an output description string in order to determine what should be done.
+ *@param documentURI is the URI of the document. The URI is presumed to be
the unique identifier which the output data store will use to process
+ * and serve the document. This URI is constructed by the repository
connector which fetches the document, and is thus universal across all output
connectors.
+ *@param outputDescription is the description string that was constructed for
this document by the getOutputDescription() method.
+ *@param document is the document data to be processed (handed to the output
data store).
+ *@param authorityNameString is the name of the authority responsible for
authorizing any access tokens passed in with the repository document. May be
null.
+ *@param activities is the handle to an object that the implementer of a
pipeline connector may use to perform operations, such as logging processing
activity,
+ * or sending a modified document to the next stage in the pipeline.
+ *@return the document status (accepted or permanently rejected).
+ *@throws IOException only if there's a stream error reading the document
data.
+ */
+ @Override
+ public int addOrReplaceDocumentWithException(String documentURI,
VersionContext pipelineDescription, RepositoryDocument document, String
authorityNameString, IOutputAddActivity activities)
+ throws ManifoldCFException, ServiceInterruption, IOException
+ {
+ // First, make sure downstream pipeline will now accept
text/plain;charset=utf-8
+ if (!activities.checkMimeTypeIndexable("text/plain;charset=utf-8"))
+ {
activities.noDocument();
- activities.recordActivity(null, ACTIVITY_EXTRACT, null, documentURI,
activities.EXCLUDED_MIMETYPE,
- "Downstream pipeline rejected mime type 'text/plain;charset=utf-8'");
+ activities.recordActivity(null, ACTIVITY_EXTRACT, null, documentURI,
+ activities.EXCLUDED_MIMETYPE, "Downstream pipeline rejected mime type
'text/plain;charset=utf-8'");
return DOCUMENTSTATUS_REJECTED;
}
SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
- // Tika server variables
- String mime = "";
- InputStream tikaServerIs = null;
- int retry = 0;
- HttpResponse response = null;
- IOException tikaServerDownException = null;
-
BoilerpipeExtractor extractorClassInstance =
sp.getExtractorClassInstance();
-
+
// Tika's API reads from an input stream and writes to an output Writer.
- // Since a RepositoryDocument includes readers and inputstreams
exclusively,
- // AND all downstream
- // processing needs to occur in a ManifoldCF thread, we have some
- // constraints on the architecture we need to get this done:
- // (1) The principle worker thread must call the downstream pipeline send()
- // method.
- // (2) The callee of the send() method must call a reader in the Repository
- // Document.
- // (3) The Reader, if its databuffer is empty, must pull more data from the
- // original input stream and hand it to Tika, which populates the Reader's
- // databuffer.
- // So all this can be done in one thread, with some work, and the creation
- // of a special InputStream or Reader implementation. Where it fails,
- // though, is the
- // requirement that tika-extracted metadata be included in the
- // RepositoryDocument right from the beginning. Effectively this means that
- // the entire document
- // must be parsed before it is handed downstream -- so basically a
temporary
- // file (or in-memory buffer if small enough) must be created.
+ // Since a RepositoryDocument includes readers and inputstreams
exclusively, AND all downstream
+ // processing needs to occur in a ManifoldCF thread, we have some
constraints on the architecture we need to get this done:
+ // (1) The principle worker thread must call the downstream pipeline
send() method.
+ // (2) The callee of the send() method must call a reader in the
Repository Document.
+ // (3) The Reader, if its databuffer is empty, must pull more data from
the original input stream and hand it to Tika, which populates the Reader's
databuffer.
+ // So all this can be done in one thread, with some work, and the creation
of a special InputStream or Reader implementation. Where it fails, though, is
the
+ // requirement that tika-extracted metadata be included in the
RepositoryDocument right from the beginning. Effectively this means that the
entire document
+ // must be parsed before it is handed downstream -- so basically a
temporary file (or in-memory buffer if small enough) must be created.
// Instead of the elegant flow above, we have the following:
// (1) Create a temporary file (or in-memory buffer if file is small
enough)
// (2) Run Tika to completion, streaming content output to temporary file
- // (3) Modify RepositoryDocument to read from temporary file, and include
- // Tika-extracted metadata
+ // (3) Modify RepositoryDocument to read from temporary file, and include
Tika-extracted metadata
// (4) Call downstream document processing
-
+
DestinationStorage ds;
-
- if (document.getBinaryLength() <= inMemoryMaximumFile) {
- ds = new MemoryDestinationStorage((int) document.getBinaryLength());
- } else {
+
+ if (document.getBinaryLength() <= inMemoryMaximumFile)
+ {
+ ds = new MemoryDestinationStorage((int)document.getBinaryLength());
+ }
+ else
+ {
ds = new FileDestinationStorage();
}
- try {
+ try
+ {
Metadata metadata = new Metadata();
- if (document.getFileName() != null) {
+ if (document.getFileName() != null)
+ {
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY,
document.getFileName());
metadata.add("stream_name", document.getFileName());
}
- if (document.getMimeType() != null) {
- mime = document.getMimeType();
- metadata.add("Content-Type", mime);
- }
+ if (document.getMimeType() != null)
+ metadata.add("Content-Type", document.getMimeType());
metadata.add("stream_size", new
Long(document.getBinaryLength()).toString());
// We only log the extraction
@@ -289,470 +210,334 @@ public class TikaExtractor extends org.a
String resultCode = "OK";
String description = null;
Long length = null;
-
- try {
- if (sp.tikaServer) {
- try {
- final HttpClient client = HttpClientBuilder.create().build();
- final HttpHost tikaHost = new HttpHost(sp.tikaHostname,
sp.tikaPort);
-
- // Make a copy of the original stream as it needs to be sent two
- // times to Tika
- // one for the metadata and one for the content
- IOUtils.copy(document.getBinaryStream(), ds.getOutputStream());
- HttpPut httpPut;
- HttpEntity entity;
-
- // Metadata
- httpPut = new HttpPut(sp.metaURI);
- if (!mime.isEmpty()) {
- httpPut.addHeader("Content-Type", mime);
- }
- httpPut.addHeader("Accept", "application/json");
- entity = new InputStreamEntity(ds.getInputStream());
- httpPut.setEntity(entity);
- while (retry < 3 && response == null) {
- try {
- response = client.execute(tikaHost, httpPut);
- tikaServerDownException = null;
- } catch (IOException e) {
- tikaServerDownException = e;
- retry++;
- if (retry < 3) {
- try {
- Thread.sleep(sp.tikaRetry);
- } catch (InterruptedException e1) {
- // Should not happen
- }
- }
- }
- }
- if (tikaServerDownException != null) {
- throw tikaServerDownException;
- }
- int responseCode = response.getStatusLine().getStatusCode();
- if (response.getStatusLine().getStatusCode() == 200 ||
response.getStatusLine().getStatusCode() == 204) {
- tikaServerIs = response.getEntity().getContent();
- try {
- final BufferedReader br = new BufferedReader(new
InputStreamReader(tikaServerIs));
- final JSONParser parser = new JSONParser();
- JSONObject metaJson;
- final StringBuilder sb = new StringBuilder();
- String output;
- while ((output = br.readLine()) != null) {
- sb.append(output);
- }
- metaJson = (JSONObject) parser.parse(sb.toString());
- for (Object key : metaJson.keySet()) {
- metadata.add(key.toString(), metaJson.get(key).toString());
- }
- } finally {
- tikaServerIs.close();
- }
- } else {
- activities.noDocument();
- if (responseCode == 422) {
- resultCode = "TIKASERVERREJECTS";
- description = "Tika Server rejected document with the
following reason: "
- + response.getStatusLine().getReasonPhrase();
- handleTikaServerRejects(description);
- } else {
- resultCode = "TIKASERVERERROR";
- description = "Tika Server failed to parse document with the
following error: "
- + response.getStatusLine().getReasonPhrase();
- handleTikaServerError(description);
- }
- return DOCUMENTSTATUS_REJECTED;
- }
-
- // Content
- httpPut = new HttpPut(sp.contentURI);
- if (!mime.isEmpty()) {
- httpPut.addHeader("Content-Type", mime);
- }
- httpPut.addHeader("Accept", "text/plain");
- entity = new InputStreamEntity(ds.getInputStream());
- httpPut.setEntity(entity);
-
- // Retry mecanism
- retry = 0;
- response = null;
- while (retry < 3 && response == null) {
- try {
- response = client.execute(tikaHost, httpPut);
- tikaServerDownException = null;
- } catch (IOException e) {
- tikaServerDownException = e;
- retry++;
- if (retry < 3) {
- try {
- Thread.sleep(sp.tikaRetry);
- } catch (InterruptedException e1) {
- // Should not happen
- }
- }
- }
- }
- if (tikaServerDownException != null) {
- throw tikaServerDownException;
- }
-
- responseCode = response.getStatusLine().getStatusCode();
- if (response.getStatusLine().getStatusCode() == 200 ||
response.getStatusLine().getStatusCode() == 204) {
- tikaServerIs = response.getEntity().getContent();
- try {
- ds.close();
- ds = new FileDestinationStorage();
- IOUtils.copyLarge(tikaServerIs, ds.getOutputStream(), 0L,
sp.writeLimit);
- length = new Long(ds.getBinaryLength());
- } finally {
- tikaServerIs.close();
- }
- } else {
- activities.noDocument();
- if (responseCode == 422) {
- resultCode = "TIKASERVERREJECTS";
- description = "Tika Server rejected document with the
following reason: "
- + response.getStatusLine().getReasonPhrase();
- handleTikaServerRejects(description);
- } else {
- resultCode = "TIKASERVERERROR";
- description = "Tika Server failed to parse document with the
following error: "
- + response.getStatusLine().getReasonPhrase();
- handleTikaServerError(description);
- }
- return DOCUMENTSTATUS_REJECTED;
- }
-
- } catch (IOException | ParseException e) {
- resultCode = "TIKASERVERRESPONSEISSUE";
- description = e.getMessage();
- int rval;
- if (e instanceof IOException) {
- rval = handleTikaServerException((IOException) e);
- } else {
- rval = handleTikaServerException((ParseException) e);
- }
- if (rval == DOCUMENTSTATUS_REJECTED) {
- activities.noDocument();
- }
- return rval;
- }
- } else {
-
- OutputStream os = ds.getOutputStream();
- try {
- Writer w = new OutputStreamWriter(os, "utf-8");
- try {
- // Use tika to parse stuff
- ContentHandler handler =
TikaParser.newWriteOutBodyContentHandler(w, sp.writeLimit());
- if (extractorClassInstance != null)
- handler = new BoilerpipeContentHandler(handler,
extractorClassInstance);
- try {
- TikaParser.parse(document.getBinaryStream(), metadata,
handler);
- } catch (TikaException e) {
- if (sp.ignoreTikaException()) {
- resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
- description = e.getMessage();
- } else {
- resultCode = "TIKAREJECTION";
- description = e.getMessage();
- int rval = handleTikaException(e);
- if (rval == DOCUMENTSTATUS_REJECTED)
- activities.noDocument();
- return rval;
- }
- } catch (SAXException e) {
+ try
+ {
+ OutputStream os = ds.getOutputStream();
+ try
+ {
+ Writer w = new OutputStreamWriter(os,"utf-8");
+ try
+ {
+ // Use tika to parse stuff
+ ContentHandler handler =
TikaParser.newWriteOutBodyContentHandler(w, sp.writeLimit());
+ if (extractorClassInstance != null)
+ handler = new BoilerpipeContentHandler(handler,
extractorClassInstance);
+ try
+ {
+ TikaParser.parse(document.getBinaryStream(), metadata, handler);
+ }
+ catch (TikaException e)
+ {
+ if (sp.ignoreTikaException())
+ {
resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
description = e.getMessage();
- int rval = handleSaxException(e);
+ }
+ else
+ {
+ resultCode = "TIKAREJECTION";
+ description = e.getMessage();
+ int rval = handleTikaException(e);
if (rval == DOCUMENTSTATUS_REJECTED)
activities.noDocument();
return rval;
- } catch (IOException e) {
- resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
- description = e.getMessage();
- throw e;
}
- } finally {
- w.flush();
}
- } finally {
- os.close();
- length = new Long(ds.getBinaryLength());
+ catch (SAXException e)
+ {
+ resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+ description = e.getMessage();
+ int rval = handleSaxException(e);
+ if (rval == DOCUMENTSTATUS_REJECTED)
+ activities.noDocument();
+ return rval;
+ }
+ catch (IOException e)
+ {
+ resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+ description = e.getMessage();
+ throw e;
+ }
+ }
+ finally
+ {
+ w.flush();
}
}
-
- if (!activities.checkLengthIndexable(ds.getBinaryLength())) {
+ finally
+ {
+ os.close();
+ length = new Long(ds.getBinaryLength());
+ }
+
+ // Check to be sure downstream pipeline will accept document of
specified length
+ if (!activities.checkLengthIndexable(ds.getBinaryLength()))
+ {
activities.noDocument();
resultCode = activities.EXCLUDED_LENGTH;
- description = "Downstream pipeline rejected document with length " +
ds.getBinaryLength();
+ description = "Downstream pipeline rejected document with length
"+ds.getBinaryLength();
return DOCUMENTSTATUS_REJECTED;
}
- } finally {
+ }
+ finally
+ {
// Log the extraction processing
- activities.recordActivity(new Long(startTime), ACTIVITY_EXTRACT,
length, documentURI, resultCode, description);
+ activities.recordActivity(new Long(startTime), ACTIVITY_EXTRACT,
length, documentURI,
+ resultCode, description);
}
-
+
// Parsing complete!
// Create a copy of Repository Document
RepositoryDocument docCopy = document.duplicate();
-
+
// Get new stream length
long newBinaryLength = ds.getBinaryLength();
// Open new input stream
InputStream is = ds.getInputStream();
+ try
+ {
+ docCopy.setBinary(is,newBinaryLength);
- try {
- docCopy.setBinary(is, newBinaryLength);
-
- // Set up all metadata from Tika. We may want to run this through a
- // mapper eventually...
+ // Set up all metadata from Tika. We may want to run this through a
mapper eventually...
String[] metaNames = metadata.names();
- for (String mName : metaNames) {
+ for(String mName : metaNames){
String value = metadata.get(mName);
- if (sp.lowerNames()) {
+ if (sp.lowerNames())
+ {
StringBuilder sb = new StringBuilder();
- for (int i = 0; i < mName.length(); i++) {
+ for (int i=0; i<mName.length(); i++) {
char ch = mName.charAt(i);
- if (!Character.isLetterOrDigit(ch))
- ch = '_';
- else
- ch = Character.toLowerCase(ch);
+ if (!Character.isLetterOrDigit(ch)) ch='_';
+ else ch=Character.toLowerCase(ch);
sb.append(ch);
}
mName = sb.toString();
}
String target = sp.getMapping(mName);
- if (target != null) {
+ if(target!=null)
+ {
docCopy.addField(target, value);
- } else {
- if (sp.keepAllMetadata()) {
- docCopy.addField(mName, value);
+ }
+ else
+ {
+ if(sp.keepAllMetadata())
+ {
+ docCopy.addField(mName, value);
}
}
}
// Send new document downstream
- return activities.sendDocument(documentURI, docCopy);
- } finally {
+ return activities.sendDocument(documentURI,docCopy);
+ }
+ finally
+ {
is.close();
}
- } finally {
+ }
+ finally
+ {
ds.close();
}
}
- /**
- * Obtain the name of the form check javascript method to call.
- *
- * @param connectionSequenceNumber
- * is the unique number of this connection within the job.
- * @return the name of the form check javascript method.
- */
- @Override
- public String getFormCheckJavascriptMethodName(int connectionSequenceNumber)
{
- return "s" + connectionSequenceNumber + "_checkSpecification";
- }
-
- /**
- * Obtain the name of the form presave check javascript method to call.
- *
- * @param connectionSequenceNumber
- * is the unique number of this connection within the job.
- * @return the name of the form presave check javascript method.
- */
- @Override
- public String getFormPresaveCheckJavascriptMethodName(int
connectionSequenceNumber) {
- return "s" + connectionSequenceNumber + "_checkSpecificationForSave";
- }
-
- /**
- * Output the specification header section. This method is called in the head
- * section of a job page which has selected a pipeline connection of the
- * current type. Its purpose is to add the required tabs to the list, and to
- * output any javascript methods that might be needed by the job editing
HTML.
- *
- * @param out
- * is the output to which any HTML should be sent.
- * @param locale
- * is the preferred local of the output.
- * @param os
- * is the current pipeline specification for this connection.
- * @param connectionSequenceNumber
- * is the unique number of this connection within the job.
- * @param tabsArray
- * is an array of tab names. Add to this array any tab names that
are
- * specific to the connector.
- */
- @Override
- public void outputSpecificationHeader(IHTTPOutput out, Locale locale,
Specification os, int connectionSequenceNumber,
- List<String> tabsArray) throws ManifoldCFException, IOException {
+ /** Obtain the name of the form check javascript method to call.
+ *@param connectionSequenceNumber is the unique number of this connection
within the job.
+ *@return the name of the form check javascript method.
+ */
+ @Override
+ public String getFormCheckJavascriptMethodName(int connectionSequenceNumber)
+ {
+ return "s"+connectionSequenceNumber+"_checkSpecification";
+ }
+
+ /** Obtain the name of the form presave check javascript method to call.
+ *@param connectionSequenceNumber is the unique number of this connection
within the job.
+ *@return the name of the form presave check javascript method.
+ */
+ @Override
+ public String getFormPresaveCheckJavascriptMethodName(int
connectionSequenceNumber)
+ {
+ return "s"+connectionSequenceNumber+"_checkSpecificationForSave";
+ }
+
+ /** Output the specification header section.
+ * This method is called in the head section of a job page which has selected
a pipeline connection of the current type. Its purpose is to add the required
tabs
+ * to the list, and to output any javascript methods that might be needed by
the job editing HTML.
+ *@param out is the output to which any HTML should be sent.
+ *@param locale is the preferred locale of the output.
+ *@param os is the current pipeline specification for this connection.
+ *@param connectionSequenceNumber is the unique number of this connection
within the job.
+ *@param tabsArray is an array of tab names. Add to this array any tab names
that are specific to the connector.
+ */
+ @Override
+ public void outputSpecificationHeader(IHTTPOutput out, Locale locale,
Specification os,
+ int connectionSequenceNumber, List<String> tabsArray)
+ throws ManifoldCFException, IOException
+ {
Map<String, Object> paramMap = new HashMap<String, Object>();
- paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
+ paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
- tabsArray.add(Messages.getString(locale, "TikaExtractor.TikaTypeTabName"));
tabsArray.add(Messages.getString(locale,
"TikaExtractor.FieldMappingTabName"));
tabsArray.add(Messages.getString(locale,
"TikaExtractor.ExceptionsTabName"));
tabsArray.add(Messages.getString(locale,
"TikaExtractor.BoilerplateTabName"));
// Fill in the specification header map, using data from all tabs.
- fillInTikaTypeSpecificationMap(paramMap, os);
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
fillInBoilerplateSpecificationMap(paramMap, os);
-
- Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS,
paramMap);
+
+
Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_JS,paramMap);
}
-
- /**
- * Output the specification body section. This method is called in the body
- * section of a job page which has selected a pipeline connection of the
- * current type. Its purpose is to present the required form elements for
- * editing. The coder can presume that the HTML that is output from this
- * configuration will be within appropriate <html>, <body>, and <form> tags.
- * The name of the form is "editjob".
- *
- * @param out
- * is the output to which any HTML should be sent.
- * @param locale
- * is the preferred local of the output.
- * @param os
- * is the current pipeline specification for this job.
- * @param connectionSequenceNumber
- * is the unique number of this connection within the job.
- * @param actualSequenceNumber
- * is the connection within the job that has currently been
selected.
- * @param tabName
- * is the current tab name.
- */
- @Override
- public void outputSpecificationBody(IHTTPOutput out, Locale locale,
Specification os, int connectionSequenceNumber,
- int actualSequenceNumber, String tabName) throws ManifoldCFException,
IOException {
+
+ /** Output the specification body section.
+ * This method is called in the body section of a job page which has selected
a pipeline connection of the current type. Its purpose is to present the
required form elements for editing.
+ * The coder can presume that the HTML that is output from this configuration
will be within appropriate <html>, <body>, and <form> tags. The name of the
+ * form is "editjob".
+ *@param out is the output to which any HTML should be sent.
+ *@param locale is the preferred locale of the output.
+ *@param os is the current pipeline specification for this job.
+ *@param connectionSequenceNumber is the unique number of this connection
within the job.
+ *@param actualSequenceNumber is the connection within the job that has
currently been selected.
+ *@param tabName is the current tab name.
+ */
+ @Override
+ public void outputSpecificationBody(IHTTPOutput out, Locale locale,
Specification os,
+ int connectionSequenceNumber, int actualSequenceNumber, String tabName)
+ throws ManifoldCFException, IOException
+ {
Map<String, Object> paramMap = new HashMap<String, Object>();
// Set the tab name
paramMap.put("TABNAME", tabName);
- paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
- paramMap.put("SELECTEDNUM", Integer.toString(actualSequenceNumber));
+ paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
+ paramMap.put("SELECTEDNUM",Integer.toString(actualSequenceNumber));
// Fill in the field mapping tab data
- fillInTikaTypeSpecificationMap(paramMap, os);
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
fillInBoilerplateSpecificationMap(paramMap, os);
-
- Messages.outputResourceWithVelocity(out, locale,
EDIT_SPECIFICATION_TIKATYPE_HTML, paramMap);
- Messages.outputResourceWithVelocity(out, locale,
EDIT_SPECIFICATION_FIELDMAPPING_HTML, paramMap);
- Messages.outputResourceWithVelocity(out, locale,
EDIT_SPECIFICATION_EXCEPTIONS_HTML, paramMap);
- Messages.outputResourceWithVelocity(out, locale,
EDIT_SPECIFICATION_BOILERPLATE_HTML, paramMap);
- }
-
- /**
- * Process a specification post. This method is called at the start of job's
- * edit or view page, whenever there is a possibility that form data for a
- * connection has been posted. Its purpose is to gather form information and
- * modify the transformation specification accordingly. The name of the
posted
- * form is "editjob".
- *
- * @param variableContext
- * contains the post data, including binary file-upload information.
- * @param locale
- * is the preferred local of the output.
- * @param os
- * is the current pipeline specification for this job.
- * @param connectionSequenceNumber
- * is the unique number of this connection within the job.
- * @return null if all is well, or a string error message if there is an
error
- * that should prevent saving of the job (and cause a redirection to
- * an error page).
- */
+
+
Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_FIELDMAPPING_HTML,paramMap);
+
Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_EXCEPTIONS_HTML,paramMap);
+
Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_BOILERPLATE_HTML,paramMap);
+ }
+
+ /** Process a specification post.
+ * This method is called at the start of job's edit or view page, whenever
there is a possibility that form data for a connection has been
+ * posted. Its purpose is to gather form information and modify the
transformation specification accordingly.
+ * The name of the posted form is "editjob".
+ *@param variableContext contains the post data, including binary file-upload
information.
+ *@param locale is the preferred locale of the output.
+ *@param os is the current pipeline specification for this job.
+ *@param connectionSequenceNumber is the unique number of this connection
within the job.
+ *@return null if all is well, or a string error message if there is an error
that should prevent saving of the job (and cause a redirection to an error
page).
+ */
@Override
public String processSpecificationPost(IPostParameters variableContext,
Locale locale, Specification os,
- int connectionSequenceNumber) throws ManifoldCFException {
- String seqPrefix = "s" + connectionSequenceNumber + "_";
+ int connectionSequenceNumber)
+ throws ManifoldCFException {
+ String seqPrefix = "s"+connectionSequenceNumber+"_";
String x;
-
- x = variableContext.getParameter(seqPrefix + "fieldmapping_count");
- if (x != null && x.length() > 0) {
+
+ x = variableContext.getParameter(seqPrefix+"fieldmapping_count");
+ if (x != null && x.length() > 0)
+ {
// About to gather the fieldmapping nodes, so get rid of the old ones.
int i = 0;
- while (i < os.getChildCount()) {
+ while (i < os.getChildCount())
+ {
SpecificationNode node = os.getChild(i);
- if (node.getType().equals(TikaConfig.NODE_FIELDMAP) ||
node.getType().equals(TikaConfig.NODE_KEEPMETADATA)
- || node.getType().equals(TikaConfig.NODE_LOWERNAMES) ||
node.getType().equals(TikaConfig.NODE_WRITELIMIT))
+ if (node.getType().equals(TikaConfig.NODE_FIELDMAP)
+ || node.getType().equals(TikaConfig.NODE_KEEPMETADATA)
+ || node.getType().equals(TikaConfig.NODE_LOWERNAMES)
+ || node.getType().equals(TikaConfig.NODE_WRITELIMIT))
os.removeChild(i);
else
i++;
}
int count = Integer.parseInt(x);
i = 0;
- while (i < count) {
- String prefix = seqPrefix + "fieldmapping_";
- String suffix = "_" + Integer.toString(i);
- String op = variableContext.getParameter(prefix + "op" + suffix);
- if (op == null || !op.equals("Delete")) {
+ while (i < count)
+ {
+ String prefix = seqPrefix+"fieldmapping_";
+ String suffix = "_"+Integer.toString(i);
+ String op = variableContext.getParameter(prefix+"op"+suffix);
+ if (op == null || !op.equals("Delete"))
+ {
// Gather the fieldmap etc.
- String source = variableContext.getParameter(prefix + "source" +
suffix);
- String target = variableContext.getParameter(prefix + "target" +
suffix);
+ String source = variableContext.getParameter(prefix+"source"+suffix);
+ String target = variableContext.getParameter(prefix+"target"+suffix);
if (target == null)
target = "";
SpecificationNode node = new
SpecificationNode(TikaConfig.NODE_FIELDMAP);
- node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE, source);
- node.setAttribute(TikaConfig.ATTRIBUTE_TARGET, target);
- os.addChild(os.getChildCount(), node);
+ node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE,source);
+ node.setAttribute(TikaConfig.ATTRIBUTE_TARGET,target);
+ os.addChild(os.getChildCount(),node);
}
i++;
}
-
- String addop = variableContext.getParameter(seqPrefix +
"fieldmapping_op");
- if (addop != null && addop.equals("Add")) {
- String source = variableContext.getParameter(seqPrefix +
"fieldmapping_source");
- String target = variableContext.getParameter(seqPrefix +
"fieldmapping_target");
+
+ String addop = variableContext.getParameter(seqPrefix+"fieldmapping_op");
+ if (addop != null && addop.equals("Add"))
+ {
+ String source =
variableContext.getParameter(seqPrefix+"fieldmapping_source");
+ String target =
variableContext.getParameter(seqPrefix+"fieldmapping_target");
if (target == null)
target = "";
SpecificationNode node = new
SpecificationNode(TikaConfig.NODE_FIELDMAP);
- node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE, source);
- node.setAttribute(TikaConfig.ATTRIBUTE_TARGET, target);
- os.addChild(os.getChildCount(), node);
+ node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE,source);
+ node.setAttribute(TikaConfig.ATTRIBUTE_TARGET,target);
+ os.addChild(os.getChildCount(),node);
}
-
+
// Gather the keep all metadata parameter to be the last one
SpecificationNode node = new
SpecificationNode(TikaConfig.NODE_KEEPMETADATA);
- String keepAll = variableContext.getParameter(seqPrefix +
"keepallmetadata");
- if (keepAll != null) {
+ String keepAll =
variableContext.getParameter(seqPrefix+"keepallmetadata");
+ if (keepAll != null)
+ {
node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, keepAll);
- } else {
+ }
+ else
+ {
node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
}
- // Add the new keepallmetadata config parameter
+ // Add the new keepallmetadata config parameter
os.addChild(os.getChildCount(), node);
-
+
SpecificationNode node2 = new
SpecificationNode(TikaConfig.NODE_LOWERNAMES);
- String lower = variableContext.getParameter(seqPrefix + "lowernames");
- if (lower != null) {
+ String lower = variableContext.getParameter(seqPrefix+"lowernames");
+ if (lower != null)
+ {
node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, lower);
- } else {
+ }
+ else
+ {
node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
}
os.addChild(os.getChildCount(), node2);
-
+
SpecificationNode node3 = new
SpecificationNode(TikaConfig.NODE_WRITELIMIT);
- String writeLimit = variableContext.getParameter(seqPrefix +
"writelimit");
- if (writeLimit != null) {
+ String writeLimit = variableContext.getParameter(seqPrefix+"writelimit");
+ if (writeLimit != null)
+ {
node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, writeLimit);
- } else {
+ }
+ else
+ {
node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
}
os.addChild(os.getChildCount(), node3);
}
-
- if (variableContext.getParameter(seqPrefix +
"ignoretikaexceptions_present") != null) {
+
+ if (variableContext.getParameter(seqPrefix+"ignoretikaexceptions_present")
!= null)
+ {
int i = 0;
- while (i < os.getChildCount()) {
+ while (i < os.getChildCount())
+ {
SpecificationNode node = os.getChild(i);
if (node.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION))
os.removeChild(i);
@@ -760,7 +545,7 @@ public class TikaExtractor extends org.a
i++;
}
- String value = variableContext.getParameter(seqPrefix +
"ignoretikaexceptions");
+ String value =
variableContext.getParameter(seqPrefix+"ignoretikaexceptions");
if (value == null)
value = "false";
@@ -768,11 +553,13 @@ public class TikaExtractor extends org.a
node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, value);
os.addChild(os.getChildCount(), node);
}
-
- x = variableContext.getParameter(seqPrefix + "boilerplateclassname");
- if (x != null) {
+
+ x = variableContext.getParameter(seqPrefix+"boilerplateclassname");
+ if (x != null)
+ {
int i = 0;
- while (i < os.getChildCount()) {
+ while (i < os.getChildCount())
+ {
SpecificationNode node = os.getChild(i);
if (node.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
os.removeChild(i);
@@ -780,265 +567,183 @@ public class TikaExtractor extends org.a
i++;
}
- if (x.length() > 0) {
+ if (x.length() > 0)
+ {
SpecificationNode node = new
SpecificationNode(TikaConfig.NODE_BOILERPLATEPROCESSOR);
node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, x);
os.addChild(os.getChildCount(), node);
}
}
-
- x = variableContext.getParameter(seqPrefix + "tikaserver");
- if (x != null) {
- int i = 0;
- while (i < os.getChildCount()) {
- SpecificationNode node = os.getChild(i);
- if (node.getType().equals(TikaConfig.NODE_TIKASERVER) ||
node.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)
- || node.getType().equals(TikaConfig.NODE_TIKAPORT) ||
node.getType().equals(TikaConfig.NODE_TIKARETRY))
- os.removeChild(i);
- else
- i++;
- }
-
- SpecificationNode node = new
SpecificationNode(TikaConfig.NODE_TIKASERVER);
- String tikaServer = variableContext.getParameter(seqPrefix +
"tikaserver");
- if (tikaServer != null) {
- node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaServer);
- } else {
- node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
- }
- // Add the new tikaserver config parameter
- os.addChild(os.getChildCount(), node);
-
- SpecificationNode node2 = new
SpecificationNode(TikaConfig.NODE_TIKAHOSTNAME);
- String tikaHostname = variableContext.getParameter(seqPrefix +
"tikahostname");
- if (tikaHostname != null) {
- node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaHostname);
- } else {
- node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
- }
- // Add the new tikahostname config parameter
- os.addChild(os.getChildCount(), node2);
-
- SpecificationNode node3 = new
SpecificationNode(TikaConfig.NODE_TIKAPORT);
- String tikaPort = variableContext.getParameter(seqPrefix + "tikaport");
- if (tikaPort != null) {
- node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaPort);
- } else {
- node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
- }
- // Add the new tikaport config parameter
- os.addChild(os.getChildCount(), node3);
-
- SpecificationNode node4 = new
SpecificationNode(TikaConfig.NODE_TIKARETRY);
- String tikaRetry = variableContext.getParameter(seqPrefix + "tikaretry");
- if (tikaRetry != null) {
- node4.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaRetry);
- } else {
- node4.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
- }
- // Add the new tikaport config parameter
- os.addChild(os.getChildCount(), node4);
- }
-
+
return null;
}
+
- /**
- * View specification. This method is called in the body section of a job's
- * view page. Its purpose is to present the pipeline specification
information
- * to the user. The coder can presume that the HTML that is output from this
- * configuration will be within appropriate <html> and <body> tags.
- *
- * @param out
- * is the output to which any HTML should be sent.
- * @param locale
- * is the preferred local of the output.
- * @param connectionSequenceNumber
- * is the unique number of this connection within the job.
- * @param os
- * is the current pipeline specification for this job.
- */
- @Override
- public void viewSpecification(IHTTPOutput out, Locale locale, Specification
os, int connectionSequenceNumber)
- throws ManifoldCFException, IOException {
+ /** View specification.
+ * This method is called in the body section of a job's view page. Its
purpose is to present the pipeline specification information to the user.
+ * The coder can presume that the HTML that is output from this configuration
will be within appropriate <html> and <body> tags.
+ *@param out is the output to which any HTML should be sent.
+ *@param locale is the preferred locale of the output.
+ *@param connectionSequenceNumber is the unique number of this connection
within the job.
+ *@param os is the current pipeline specification for this job.
+ */
+ @Override
+ public void viewSpecification(IHTTPOutput out, Locale locale, Specification
os,
+ int connectionSequenceNumber)
+ throws ManifoldCFException, IOException
+ {
Map<String, Object> paramMap = new HashMap<String, Object>();
- paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
+ paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
// Fill in the map with data from all tabs
- fillInTikaTypeSpecificationMap(paramMap, os);
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
fillInBoilerplateSpecificationMap(paramMap, os);
- Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML,
paramMap);
-
- }
-
- protected static void fillInTikaTypeSpecificationMap(Map<String, Object>
paramMap, Specification os) {
- String tikaServer = "false";
- String tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
- String tikaPort = String.valueOf(TikaConfig.TIKAPORT_DEFAULT);
- String tikaRetry = String.valueOf(TikaConfig.TIKARETRY_DEFAULT);
- for (int i = 0; i < os.getChildCount(); i++) {
- SpecificationNode sn = os.getChild(i);
- if (sn.getType().equals(TikaConfig.NODE_TIKASERVER)) {
- tikaServer = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- } else if (sn.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)) {
- tikaHostname = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- } else if (sn.getType().equals(TikaConfig.NODE_TIKAPORT)) {
- tikaPort = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- } else if (sn.getType().equals(TikaConfig.NODE_TIKARETRY)) {
- tikaRetry = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- }
- }
- paramMap.put("TIKASERVER", tikaServer);
- paramMap.put("TIKAHOSTNAME", tikaHostname);
- paramMap.put("TIKAPORT", tikaPort);
- paramMap.put("TIKARETRY", tikaRetry);
+
Messages.outputResourceWithVelocity(out,locale,VIEW_SPECIFICATION_HTML,paramMap);
+
}
- protected static void fillInFieldMappingSpecificationMap(Map<String, Object>
paramMap, Specification os) {
+ protected static void fillInFieldMappingSpecificationMap(Map<String,Object>
paramMap, Specification os)
+ {
// Prep for field mappings
- List<Map<String, String>> fieldMappings = new ArrayList<Map<String,
String>>();
+ List<Map<String,String>> fieldMappings = new
ArrayList<Map<String,String>>();
String keepAllMetadataValue = "true";
String lowernamesValue = "false";
String writeLimitValue = "";
- for (int i = 0; i < os.getChildCount(); i++) {
+ for (int i = 0; i < os.getChildCount(); i++)
+ {
SpecificationNode sn = os.getChild(i);
if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
String targetDisplay;
- if (target == null) {
+ if (target == null)
+ {
target = "";
targetDisplay = "(remove)";
- } else
+ }
+ else
targetDisplay = target;
- Map<String, String> fieldMapping = new HashMap<String, String>();
- fieldMapping.put("SOURCE", source);
- fieldMapping.put("TARGET", target);
- fieldMapping.put("TARGETDISPLAY", targetDisplay);
+ Map<String,String> fieldMapping = new HashMap<String,String>();
+ fieldMapping.put("SOURCE",source);
+ fieldMapping.put("TARGET",target);
+ fieldMapping.put("TARGETDISPLAY",targetDisplay);
fieldMappings.add(fieldMapping);
- } else if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
+ }
+ else if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA))
+ {
keepAllMetadataValue =
sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- } else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
+ }
+ else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES))
+ {
lowernamesValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- } else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
+ }
+ else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT))
+ {
writeLimitValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
}
}
- paramMap.put("FIELDMAPPINGS", fieldMappings);
- paramMap.put("KEEPALLMETADATA", keepAllMetadataValue);
- paramMap.put("LOWERNAMES", lowernamesValue);
- paramMap.put("WRITELIMIT", writeLimitValue);
+ paramMap.put("FIELDMAPPINGS",fieldMappings);
+ paramMap.put("KEEPALLMETADATA",keepAllMetadataValue);
+ paramMap.put("LOWERNAMES",lowernamesValue);
+ paramMap.put("WRITELIMIT",writeLimitValue);
}
- protected static void fillInExceptionsSpecificationMap(Map<String, Object>
paramMap, Specification os) {
+ protected static void fillInExceptionsSpecificationMap(Map<String,Object>
paramMap, Specification os)
+ {
String ignoreTikaExceptions = "true";
- for (int i = 0; i < os.getChildCount(); i++) {
+ for (int i = 0; i < os.getChildCount(); i++)
+ {
SpecificationNode sn = os.getChild(i);
- if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION)) {
+ if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION))
+ {
ignoreTikaExceptions =
sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
}
}
- paramMap.put("IGNORETIKAEXCEPTIONS", ignoreTikaExceptions);
+ paramMap.put("IGNORETIKAEXCEPTIONS",ignoreTikaExceptions);
}
- protected static void fillInBoilerplateSpecificationMap(Map<String, Object>
paramMap, Specification os) {
+ protected static void fillInBoilerplateSpecificationMap(Map<String,Object>
paramMap, Specification os)
+ {
String boilerplateClassName = "";
- for (int i = 0; i < os.getChildCount(); i++) {
+ for (int i = 0; i < os.getChildCount(); i++)
+ {
SpecificationNode sn = os.getChild(i);
- if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR)) {
+ if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
+ {
boilerplateClassName =
sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
}
}
- paramMap.put("BOILERPLATECLASSNAME", boilerplateClassName);
+ paramMap.put("BOILERPLATECLASSNAME",boilerplateClassName);
}
protected static int handleTikaException(TikaException e)
- throws IOException, ManifoldCFException, ServiceInterruption {
- // MHL - what does Tika throw if it gets an IOException reading the
stream??
- Logging.ingest.warn("Tika: Tika exception extracting: " + e.getMessage(),
e);
- return DOCUMENTSTATUS_REJECTED;
- }
-
- protected static int handleTikaServerRejects(String reason)
- throws IOException, ManifoldCFException, ServiceInterruption {
- // MHL - what does Tika throw if it gets an IOException reading the
stream??
- Logging.ingest.warn("Tika Server: Tika Server rejects: " + reason);
- return DOCUMENTSTATUS_REJECTED;
- }
-
- protected static int handleTikaServerError(String description)
- throws IOException, ManifoldCFException, ServiceInterruption {
- // MHL - what does Tika throw if it gets an IOException reading the
stream??
- Logging.ingest.warn("Tika Server: Tika Server error: " + description);
- return DOCUMENTSTATUS_REJECTED;
- }
-
- protected static int handleTikaServerException(IOException e)
- throws IOException, ManifoldCFException, ServiceInterruption {
+ throws IOException, ManifoldCFException, ServiceInterruption
+ {
// MHL - what does Tika throw if it gets an IOException reading the
stream??
- Logging.ingest.warn("Tika: Tika exception extracting: " + e.getMessage(),
e);
+ Logging.ingest.warn("Tika: Tika exception extracting: "+e.getMessage(),e);
return DOCUMENTSTATUS_REJECTED;
}
-
- protected static int handleTikaServerException(ParseException e)
- throws IOException, ManifoldCFException, ServiceInterruption {
- // MHL - what does Tika throw if it gets an IOException reading the
stream??
- Logging.ingest.warn("Tika: Tika exception extracting: " + e.getMessage(),
e);
- return DOCUMENTSTATUS_REJECTED;
- }
-
- protected static int handleSaxException(SAXException e) throws IOException,
ManifoldCFException, ServiceInterruption {
+
+ protected static int handleSaxException(SAXException e)
+ throws IOException, ManifoldCFException, ServiceInterruption
+ {
// MHL - what does this mean?
- Logging.ingest.warn("Tika: SAX exception extracting: " + e.getMessage(),
e);
+ Logging.ingest.warn("Tika: SAX exception extracting: "+e.getMessage(),e);
return DOCUMENTSTATUS_REJECTED;
}
-
- protected static int handleIOException(IOException e) throws
ManifoldCFException {
+
+ protected static int handleIOException(IOException e)
+ throws ManifoldCFException
+ {
// IOException reading from our local storage...
if (e instanceof InterruptedIOException)
- throw new ManifoldCFException(e.getMessage(), e,
ManifoldCFException.INTERRUPTED);
- throw new ManifoldCFException(e.getMessage(), e);
+ throw new
ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ throw new ManifoldCFException(e.getMessage(),e);
}
-
- protected static interface DestinationStorage {
- /**
- * Get the output stream to write to. Caller should explicitly close this
- * stream when done writing.
- */
- public OutputStream getOutputStream() throws ManifoldCFException;
-
- /**
- * Get new binary length.
- */
- public long getBinaryLength() throws ManifoldCFException;
-
- /**
- * Get the input stream to read from. Caller should explicitly close this
- * stream when done reading.
- */
- public InputStream getInputStream() throws ManifoldCFException;
-
- /**
- * Close the object and clean up everything. This should be called when the
- * data is no longer needed.
- */
- public void close() throws ManifoldCFException;
- }
-
- protected static class FileDestinationStorage implements DestinationStorage {
+
+ protected static interface DestinationStorage
+ {
+ /** Get the output stream to write to. Caller should explicitly close
this stream when done writing.
+ */
+ public OutputStream getOutputStream()
+ throws ManifoldCFException;
+
+ /** Get new binary length.
+ */
+ public long getBinaryLength()
+ throws ManifoldCFException;
+
+ /** Get the input stream to read from. Caller should explicitly close
this stream when done reading.
+ */
+ public InputStream getInputStream()
+ throws ManifoldCFException;
+
+ /** Close the object and clean up everything.
+ * This should be called when the data is no longer needed.
+ */
+ public void close()
+ throws ManifoldCFException;
+ }
+
+ protected static class FileDestinationStorage implements DestinationStorage
+ {
protected final File outputFile;
protected final OutputStream outputStream;
- public FileDestinationStorage() throws ManifoldCFException {
+ public FileDestinationStorage()
+ throws ManifoldCFException
+ {
File outputFile;
OutputStream outputStream;
- try {
- outputFile = File.createTempFile("mcftika", "tmp");
+ try
+ {
+ outputFile = File.createTempFile("mcftika","tmp");
outputStream = new FileOutputStream(outputFile);
- } catch (IOException e) {
+ }
+ catch (IOException e)
+ {
handleIOException(e);
outputFile = null;
outputStream = null;
@@ -1046,127 +751,121 @@ public class TikaExtractor extends org.a
this.outputFile = outputFile;
this.outputStream = outputStream;
}
-
+
@Override
- public OutputStream getOutputStream() throws ManifoldCFException {
+ public OutputStream getOutputStream()
+ throws ManifoldCFException
+ {
return outputStream;
}
-
- /**
- * Get new binary length.
- */
+
+ /** Get new binary length.
+ */
@Override
- public long getBinaryLength() throws ManifoldCFException {
+ public long getBinaryLength()
+ throws ManifoldCFException
+ {
return outputFile.length();
}
- /**
- * Get the input stream to read from. Caller should explicitly close this
- * stream when done reading.
- */
+ /** Get the input stream to read from. Caller should explicitly close
this stream when done reading.
+ */
@Override
- public InputStream getInputStream() throws ManifoldCFException {
- try {
+ public InputStream getInputStream()
+ throws ManifoldCFException
+ {
+ try
+ {
return new FileInputStream(outputFile);
- } catch (IOException e) {
+ }
+ catch (IOException e)
+ {
handleIOException(e);
return null;
}
}
-
- /**
- * Close the object and clean up everything. This should be called when the
- * data is no longer needed.
- */
+
+ /** Close the object and clean up everything.
+ * This should be called when the data is no longer needed.
+ */
@Override
- public void close() throws ManifoldCFException {
+ public void close()
+ throws ManifoldCFException
+ {
outputFile.delete();
}
}
-
- protected static class MemoryDestinationStorage implements
DestinationStorage {
+
+ protected static class MemoryDestinationStorage implements DestinationStorage
+ {
protected final ByteArrayOutputStream outputStream;
-
- public MemoryDestinationStorage(int sizeHint) {
+
+ public MemoryDestinationStorage(int sizeHint)
+ {
outputStream = new ByteArrayOutputStream(sizeHint);
}
-
+
@Override
- public OutputStream getOutputStream() throws ManifoldCFException {
+ public OutputStream getOutputStream()
+ throws ManifoldCFException
+ {
return outputStream;
}
- /**
- * Get new binary length.
- */
+ /** Get new binary length.
+ */
@Override
- public long getBinaryLength() throws ManifoldCFException {
+ public long getBinaryLength()
+ throws ManifoldCFException
+ {
return outputStream.size();
}
-
- /**
- * Get the input stream to read from. Caller should explicitly close this
- * stream when done reading.
- */
+
+ /** Get the input stream to read from. Caller should explicitly close
this stream when done reading.
+ */
@Override
- public InputStream getInputStream() throws ManifoldCFException {
+ public InputStream getInputStream()
+ throws ManifoldCFException
+ {
return new ByteArrayInputStream(outputStream.toByteArray());
}
-
- /**
- * Close the object and clean up everything. This should be called when the
- * data is no longer needed.
- */
- public void close() throws ManifoldCFException {
+
+ /** Close the object and clean up everything.
+ * This should be called when the data is no longer needed.
+ */
+ public void close()
+ throws ManifoldCFException
+ {
}
}
protected static class SpecPacker {
-
- private final Map<String, String> sourceTargets = new HashMap<String,
String>();
+
+ private final Map<String,String> sourceTargets = new
HashMap<String,String>();
private final boolean keepAllMetadata;
private final boolean lowerNames;
private final int writeLimit;
private final boolean ignoreTikaException;
private final String extractorClassName;
- private URI metaURI;
- private URI contentURI;
- private final String tikaHostname;
- private final int tikaPort;
- private final boolean tikaServer;
- private final long tikaRetry;
-
+
public SpecPacker(Specification os) {
boolean keepAllMetadata = true;
boolean lowerNames = false;
int writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
boolean ignoreTikaException = true;
String extractorClassName = null;
- String tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
- int tikaPort = TikaConfig.TIKAPORT_DEFAULT;
- boolean tikaServer = false;
- long tikaRetry = TikaConfig.TIKARETRY_DEFAULT;
- try {
- metaURI = new URI("/meta");
- contentURI = new URI("/tika");
- } catch (URISyntaxException e) {
- // Should be impossible
- metaURI = null;
- contentURI = null;
- }
-
for (int i = 0; i < os.getChildCount(); i++) {
SpecificationNode sn = os.getChild(i);
-
- if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
+
+ if(sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
keepAllMetadata = Boolean.parseBoolean(value);
- } else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
+ } else if(sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
lowerNames = Boolean.parseBoolean(value);
- } else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
+ } else if(sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
if (value.length() == 0) {
writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
@@ -1176,7 +875,7 @@ public class TikaExtractor extends org.a
} else if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
-
+
if (target == null) {
target = "";
}
@@ -1186,34 +885,6 @@ public class TikaExtractor extends org.a
ignoreTikaException = Boolean.parseBoolean(value);
} else if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR)) {
extractorClassName =
sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- } else if (sn.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)) {
- String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- if (value.length() == 0) {
- tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
- } else {
- tikaHostname = value;
- }
- } else if (sn.getType().equals(TikaConfig.NODE_TIKAPORT)) {
- String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- if (value.length() == 0) {
- tikaPort = TikaConfig.TIKAPORT_DEFAULT;
- } else {
- tikaPort = Integer.parseInt(value);
- }
- } else if (sn.getType().equals(TikaConfig.NODE_TIKASERVER)) {
- String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- if (value.length() == 0) {
- tikaServer = false;
- } else {
- tikaServer = Boolean.parseBoolean(value);
- }
- } else if (sn.getType().equals(TikaConfig.NODE_TIKARETRY)) {
- String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- if (value.length() == 0) {
- tikaRetry = TikaConfig.TIKARETRY_DEFAULT;
- } else {
- tikaRetry = Long.parseLong(value);
- }
}
}
this.keepAllMetadata = keepAllMetadata;
@@ -1221,16 +892,12 @@ public class TikaExtractor extends org.a
this.writeLimit = writeLimit;
this.ignoreTikaException = ignoreTikaException;
this.extractorClassName = extractorClassName;
- this.tikaHostname = tikaHostname;
- this.tikaPort = tikaPort;
- this.tikaServer = tikaServer;
- this.tikaRetry = tikaRetry;
}
-
+
public String toPackedString() {
StringBuilder sb = new StringBuilder();
int i;
-
+
// Mappings
final String[] sortArray = new String[sourceTargets.size()];
i = 0;
@@ -1238,7 +905,7 @@ public class TikaExtractor extends org.a
sortArray[i++] = source;
}
java.util.Arrays.sort(sortArray);
-
+
List<String> packedMappings = new ArrayList<String>();
String[] fixedList = new String[2];
for (String source : sortArray) {
@@ -1246,10 +913,10 @@ public class TikaExtractor extends org.a
StringBuilder localBuffer = new StringBuilder();
fixedList[0] = source;
fixedList[1] = target;
- packFixedList(localBuffer, fixedList, ':');
+ packFixedList(localBuffer,fixedList,':');
packedMappings.add(localBuffer.toString());
}
- packList(sb, packedMappings, '+');
+ packList(sb,packedMappings,'+');
// Keep all metadata
if (keepAllMetadata)
@@ -1257,11 +924,12 @@ public class TikaExtractor extends org.a
else
sb.append('-');
if (lowerNames)
- sb.append('+');
- else
- sb.append('-');
+ sb.append('+');
+ else
+ sb.append('-');
- if (writeLimit != TikaConfig.WRITELIMIT_DEFAULT) {
+ if (writeLimit != TikaConfig.WRITELIMIT_DEFAULT)
+ {
sb.append('+');
sb.append(writeLimit);
}
@@ -1271,60 +939,55 @@ public class TikaExtractor extends org.a
else
sb.append('-');
- if (extractorClassName != null) {
+ if (extractorClassName != null)
+ {
sb.append('+');
sb.append(extractorClassName);
- } else
+ }
+ else
sb.append('-');
-
+
return sb.toString();
}
-
- public URI metaURI() {
- return metaURI;
- }
-
- public URI contentURI() {
- return contentURI;
- }
-
+
public String getMapping(String source) {
return sourceTargets.get(source);
}
-
+
public boolean keepAllMetadata() {
return keepAllMetadata;
}
-
+
public boolean lowerNames() {
return lowerNames;
}
-
+
public int writeLimit() {
return writeLimit;
}
-
+
public boolean ignoreTikaException() {
return ignoreTikaException;
}
-
- public BoilerpipeExtractor getExtractorClassInstance() throws
ManifoldCFException {
+
+ public BoilerpipeExtractor getExtractorClassInstance()
+ throws ManifoldCFException {
if (extractorClassName == null)
return null;
try {
ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
Class extractorClass = loader.loadClass(extractorClassName);
java.lang.reflect.Field f = extractorClass.getField("INSTANCE");
- return (BoilerpipeExtractor) f.get(null);
+ return (BoilerpipeExtractor)f.get(null);
} catch (ClassNotFoundException e) {
- throw new ManifoldCFException(
- "Boilerpipe extractor class '" + extractorClassName + "' not
found: " + e.getMessage(), e);
+ throw new ManifoldCFException("Boilerpipe extractor class
'"+extractorClassName+"' not found: "+e.getMessage(),e);
} catch (Exception e) {
- throw new ManifoldCFException(
- "Boilerpipe extractor class '" + extractorClassName + "' exception
on instantiation: " + e.getMessage(), e);
+ throw new ManifoldCFException("Boilerpipe extractor class
'"+extractorClassName+"' exception on instantiation: "+e.getMessage(),e);
}
}
}
}
+
+
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties?rev=1794806&r1=1794805&r2=1794806&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
(original)
+++
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
Thu May 11 10:36:57 2017
@@ -13,13 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-TikaExtractor.TikaHostname=Tika hostname:
-TikaExtractor.TikaPort=Tika port:
-TikaExtractor.TikaRetry=Retry interval (ms):
-TikaExtractor.TikaParsersSelected=Tika Parsers
-TikaExtractor.TikaServerSelected=Tika Server
-TikaExtractor.TikaTypeTabName=Tika type
-TikaExtractor.TikaType=Tika type:
TikaExtractor.FieldMappingTabName=Field mapping
TikaExtractor.ExceptionsTabName=Exceptions
TikaExtractor.BoilerplateTabName=Boilerplate