Author: kwright
Date: Wed May 10 13:27:33 2017
New Revision: 1794722
URL: http://svn.apache.org/viewvc?rev=1794722&view=rev
Log:
Add Tika external access functionality
Added:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_TikaType.html
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification.js
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore Wed May 10
13:27:33 2017
@@ -1,3 +1,4 @@
+/target/
/.classpath
-/.project
/.settings/
+/.project
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
(original)
+++
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
Wed May 10 13:27:33 2017
@@ -37,5 +37,12 @@ public class TikaConfig {
public static final String ATTRIBUTE_SOURCE = "source";
public static final String ATTRIBUTE_TARGET = "target";
public static final String ATTRIBUTE_VALUE = "value";
+ public static final String TIKAHOSTNAME_DEFAULT = "localhost";
+ public static final int TIKAPORT_DEFAULT = 9998;
+ public static final String NODE_TIKAHOSTNAME = "tikaHostname";
+ public static final String NODE_TIKAPORT = "tikaPort";
+ public static final String NODE_TIKASERVER = "tikaServer";
+ public static final long TIKARETRY_DEFAULT = 10000;
+ public static final String NODE_TIKARETRY = "tikaRetry";
}
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
(original)
+++
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
Wed May 10 13:27:33 2017
@@ -19,30 +19,48 @@
package org.apache.manifoldcf.agents.transformation.tika;
import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.TeeInputStream;
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpHost;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.client.methods.HttpPut;
+import org.apache.http.entity.InputStreamEntity;
+import org.apache.http.entity.mime.Header;
+import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.agents.system.Logging;
import java.io.*;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.util.*;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
import de.l3s.boilerpipe.BoilerpipeExtractor;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-/** This connector works as a transformation connector, but does nothing other
than logging.
-*
-*/
-public class TikaExtractor extends
org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
-{
+/**
+ * This connector works as a transformation connector, extracting document
+ * content and metadata with Tika — either in-process or via an external
+ * Tika server.
+ */
+public class TikaExtractor extends
org.apache.manifoldcf.agents.transformation.BaseTransformationConnector {
public static final String _rcsid = "@(#)$Id$";
private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
+ private static final String EDIT_SPECIFICATION_TIKATYPE_HTML =
"editSpecification_TikaType.html";
private static final String EDIT_SPECIFICATION_FIELDMAPPING_HTML =
"editSpecification_FieldMapping.html";
private static final String EDIT_SPECIFICATION_EXCEPTIONS_HTML =
"editSpecification_Exceptions.html";
private static final String EDIT_SPECIFICATION_BOILERPLATE_HTML =
"editSpecification_Boilerplate.html";
@@ -50,159 +68,221 @@ public class TikaExtractor extends org.a
protected static final String ACTIVITY_EXTRACT = "extract";
- protected static final String[] activitiesList = new
String[]{ACTIVITY_EXTRACT};
-
+ protected static final String[] activitiesList = new String[] {
ACTIVITY_EXTRACT };
+
/** We handle up to 64K in memory; after that we go to disk. */
protected static final long inMemoryMaximumFile = 65536;
-
- /** Return a list of activities that this connector generates.
- * The connector does NOT need to be connected before this method is called.
- *@return the set of activities.
- */
+
+ /**
+ * Return a list of activities that this connector generates. The connector
+ * does NOT need to be connected before this method is called.
+ *
+ * @return the set of activities.
+ */
@Override
- public String[] getActivitiesList()
- {
+ public String[] getActivitiesList() {
return activitiesList;
}
- /** Get an output version string, given an output specification. The output
version string is used to uniquely describe the pertinent details of
- * the output specification and the configuration, to allow the Connector
Framework to determine whether a document will need to be output again.
- * Note that the contents of the document cannot be considered by this
method, and that a different version string (defined in IRepositoryConnector)
- * is used to describe the version of the actual document.
- *
- * This method presumes that the connector object has been configured, and it
is thus able to communicate with the output data store should that be
- * necessary.
- *@param os is the current output specification for the job that is doing the
crawling.
- *@return a string, of unlimited length, which uniquely describes output
configuration and specification in such a way that if two such strings are
equal,
- * the document will not need to be sent again to the output data store.
- */
- @Override
- public VersionContext getPipelineDescription(Specification os)
- throws ManifoldCFException, ServiceInterruption
- {
+ /**
+ * Get an output version string, given an output specification. The output
+ * version string is used to uniquely describe the pertinent details of the
+ * output specification and the configuration, to allow the Connector
+ * Framework to determine whether a document will need to be output again.
+ * Note that the contents of the document cannot be considered by this
method,
+ * and that a different version string (defined in IRepositoryConnector) is
+ * used to describe the version of the actual document.
+ *
+ * This method presumes that the connector object has been configured, and it
+ * is thus able to communicate with the output data store should that be
+ * necessary.
+ *
+ * @param os
+ * is the current output specification for the job that is doing the
+ * crawling.
+ * @return a string, of unlimited length, which uniquely describes output
+ * configuration and specification in such a way that if two such
+ * strings are equal, the document will not need to be sent again to
+ * the output data store.
+ */
+ @Override
+ public VersionContext getPipelineDescription(Specification os) throws
ManifoldCFException, ServiceInterruption {
SpecPacker sp = new SpecPacker(os);
- return new VersionContext(sp.toPackedString(),params,os);
+ return new VersionContext(sp.toPackedString(), params, os);
}
- // We intercept checks pertaining to the document format and send modified
checks further down
-
- /** Detect if a mime type is acceptable or not. This method is used to
determine whether it makes sense to fetch a document
- * in the first place.
- *@param pipelineDescription is the document's pipeline version string, for
this connection.
- *@param mimeType is the mime type of the document.
- *@param checkActivity is an object including the activities that can be
performed by this method.
- *@return true if the mime type can be accepted by this connector.
- */
- @Override
- public boolean checkMimeTypeIndexable(VersionContext pipelineDescription,
String mimeType, IOutputCheckActivity checkActivity)
- throws ManifoldCFException, ServiceInterruption
- {
+ // We intercept checks pertaining to the document format and send modified
+ // checks further down
+
+ /**
+ * Detect if a mime type is acceptable or not. This method is used to
+ * determine whether it makes sense to fetch a document in the first place.
+ *
+ * @param pipelineDescription
+ * is the document's pipeline version string, for this connection.
+ * @param mimeType
+ * is the mime type of the document.
+ * @param checkActivity
+ * is an object including the activities that can be performed by
+ * this method.
+ * @return true if the mime type can be accepted by this connector.
+ */
+ @Override
+ public boolean checkMimeTypeIndexable(VersionContext pipelineDescription,
String mimeType,
+ IOutputCheckActivity checkActivity) throws ManifoldCFException,
ServiceInterruption {
// We should see what Tika will transform
// MHL
// Do a downstream check
return checkActivity.checkMimeTypeIndexable("text/plain;charset=utf-8");
}
- /** Pre-determine whether a document (passed here as a File object) is
acceptable or not. This method is
- * used to determine whether a document needs to be actually transferred.
This hook is provided mainly to support
- * search engines that only handle a small set of accepted file types.
- *@param pipelineDescription is the document's pipeline version string, for
this connection.
- *@param localFile is the local file to check.
- *@param checkActivity is an object including the activities that can be done
by this method.
- *@return true if the file is acceptable, false if not.
- */
- @Override
- public boolean checkDocumentIndexable(VersionContext pipelineDescription,
File localFile, IOutputCheckActivity checkActivity)
- throws ManifoldCFException, ServiceInterruption
- {
- // Document contents are not germane anymore, unless it looks like Tika
won't accept them.
+ /**
+ * Pre-determine whether a document (passed here as a File object) is
+ * acceptable or not. This method is used to determine whether a document
+ * needs to be actually transferred. This hook is provided mainly to support
+ * search engines that only handle a small set of accepted file types.
+ *
+ * @param pipelineDescription
+ * is the document's pipeline version string, for this connection.
+ * @param localFile
+ * is the local file to check.
+ * @param checkActivity
+ * is an object including the activities that can be done by this
+ * method.
+ * @return true if the file is acceptable, false if not.
+ */
+ @Override
+ public boolean checkDocumentIndexable(VersionContext pipelineDescription,
File localFile,
+ IOutputCheckActivity checkActivity) throws ManifoldCFException,
ServiceInterruption {
+ // Document contents are not germane anymore, unless it looks like Tika
+ // won't accept them.
// Not sure how to check that...
return true;
}
- /** Pre-determine whether a document's length is acceptable. This method is
used
- * to determine whether to fetch a document in the first place.
- *@param pipelineDescription is the document's pipeline version string, for
this connection.
- *@param length is the length of the document.
- *@param checkActivity is an object including the activities that can be done
by this method.
- *@return true if the file is acceptable, false if not.
- */
- @Override
- public boolean checkLengthIndexable(VersionContext pipelineDescription, long
length, IOutputCheckActivity checkActivity)
- throws ManifoldCFException, ServiceInterruption
- {
+ /**
+ * Pre-determine whether a document's length is acceptable. This method is
+ * used to determine whether to fetch a document in the first place.
+ *
+ * @param pipelineDescription
+ * is the document's pipeline version string, for this connection.
+ * @param length
+ * is the length of the document.
+ * @param checkActivity
+ * is an object including the activities that can be done by this
+ * method.
+ * @return true if the file is acceptable, false if not.
+ */
+ @Override
+ public boolean checkLengthIndexable(VersionContext pipelineDescription, long
length,
+ IOutputCheckActivity checkActivity) throws ManifoldCFException,
ServiceInterruption {
// Always true
return true;
}
- /** Add (or replace) a document in the output data store using the connector.
- * This method presumes that the connector object has been configured, and it
is thus able to communicate with the output data store should that be
- * necessary.
- * The OutputSpecification is *not* provided to this method, because the goal
is consistency, and if output is done it must be consistent with the
- * output description, since that was what was partly used to determine if
output should be taking place. So it may be necessary for this method to decode
- * an output description string in order to determine what should be done.
- *@param documentURI is the URI of the document. The URI is presumed to be
the unique identifier which the output data store will use to process
- * and serve the document. This URI is constructed by the repository
connector which fetches the document, and is thus universal across all output
connectors.
- *@param outputDescription is the description string that was constructed for
this document by the getOutputDescription() method.
- *@param document is the document data to be processed (handed to the output
data store).
- *@param authorityNameString is the name of the authority responsible for
authorizing any access tokens passed in with the repository document. May be
null.
- *@param activities is the handle to an object that the implementer of a
pipeline connector may use to perform operations, such as logging processing
activity,
- * or sending a modified document to the next stage in the pipeline.
- *@return the document status (accepted or permanently rejected).
- *@throws IOException only if there's a stream error reading the document
data.
- */
- @Override
- public int addOrReplaceDocumentWithException(String documentURI,
VersionContext pipelineDescription, RepositoryDocument document, String
authorityNameString, IOutputAddActivity activities)
- throws ManifoldCFException, ServiceInterruption, IOException
- {
- // First, make sure downstream pipeline will now accept
text/plain;charset=utf-8
- if (!activities.checkMimeTypeIndexable("text/plain;charset=utf-8"))
- {
+ /**
+ * Add (or replace) a document in the output data store using the connector.
+ * This method presumes that the connector object has been configured, and it
+ * is thus able to communicate with the output data store should that be
+ * necessary. The OutputSpecification is *not* provided to this method,
+ * because the goal is consistency, and if output is done it must be
+ * consistent with the output description, since that was what was partly
used
+ * to determine if output should be taking place. So it may be necessary for
+ * this method to decode an output description string in order to determine
+ * what should be done.
+ *
+ * @param documentURI
+ * is the URI of the document. The URI is presumed to be the unique
+ * identifier which the output data store will use to process and
+ * serve the document. This URI is constructed by the repository
+ * connector which fetches the document, and is thus universal
across
+ * all output connectors.
+ * @param outputDescription
+ * is the description string that was constructed for this document
+ * by the getOutputDescription() method.
+ * @param document
+ * is the document data to be processed (handed to the output data
+ * store).
+ * @param authorityNameString
+ * is the name of the authority responsible for authorizing any
+ * access tokens passed in with the repository document. May be
null.
+ * @param activities
+ * is the handle to an object that the implementer of a pipeline
+ * connector may use to perform operations, such as logging
+ * processing activity, or sending a modified document to the next
+ * stage in the pipeline.
+ * @return the document status (accepted or permanently rejected).
+ * @throws IOException
+ * only if there's a stream error reading the document data.
+ */
+ @Override
+ public int addOrReplaceDocumentWithException(String documentURI,
VersionContext pipelineDescription,
+ RepositoryDocument document, String authorityNameString,
IOutputAddActivity activities)
+ throws ManifoldCFException, ServiceInterruption, IOException {
+ // First, make sure downstream pipeline will now accept
+ // text/plain;charset=utf-8
+ if (!activities.checkMimeTypeIndexable("text/plain;charset=utf-8")) {
activities.noDocument();
- activities.recordActivity(null, ACTIVITY_EXTRACT, null, documentURI,
- activities.EXCLUDED_MIMETYPE, "Downstream pipeline rejected mime type
'text/plain;charset=utf-8'");
+ activities.recordActivity(null, ACTIVITY_EXTRACT, null, documentURI,
activities.EXCLUDED_MIMETYPE,
+ "Downstream pipeline rejected mime type 'text/plain;charset=utf-8'");
return DOCUMENTSTATUS_REJECTED;
}
SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
+ // Tika server variables
+ String mime = "";
+ InputStream tikaServerIs = null;
+ int retry = 0;
+ HttpResponse response = null;
+ IOException tikaServerDownException = null;
+
BoilerpipeExtractor extractorClassInstance =
sp.getExtractorClassInstance();
-
+
// Tika's API reads from an input stream and writes to an output Writer.
- // Since a RepositoryDocument includes readers and inputstreams
exclusively, AND all downstream
- // processing needs to occur in a ManifoldCF thread, we have some
constraints on the architecture we need to get this done:
- // (1) The principle worker thread must call the downstream pipeline
send() method.
- // (2) The callee of the send() method must call a reader in the
Repository Document.
- // (3) The Reader, if its databuffer is empty, must pull more data from
the original input stream and hand it to Tika, which populates the Reader's
databuffer.
- // So all this can be done in one thread, with some work, and the creation
of a special InputStream or Reader implementation. Where it fails, though, is
the
- // requirement that tika-extracted metadata be included in the
RepositoryDocument right from the beginning. Effectively this means that the
entire document
- // must be parsed before it is handed downstream -- so basically a
temporary file (or in-memory buffer if small enough) must be created.
+ // Since a RepositoryDocument includes readers and inputstreams
exclusively,
+ // AND all downstream
+ // processing needs to occur in a ManifoldCF thread, we have some
+ // constraints on the architecture we need to get this done:
// (1) The principal worker thread must call the downstream pipeline send()
// method.
+ // (2) The callee of the send() method must call a reader in the Repository
+ // Document.
+ // (3) The Reader, if its databuffer is empty, must pull more data from the
+ // original input stream and hand it to Tika, which populates the Reader's
+ // databuffer.
+ // So all this can be done in one thread, with some work, and the creation
+ // of a special InputStream or Reader implementation. Where it fails,
+ // though, is the
+ // requirement that tika-extracted metadata be included in the
+ // RepositoryDocument right from the beginning. Effectively this means that
+ // the entire document
+ // must be parsed before it is handed downstream -- so basically a
temporary
+ // file (or in-memory buffer if small enough) must be created.
// Instead of the elegant flow above, we have the following:
// (1) Create a temporary file (or in-memory buffer if file is small
enough)
// (2) Run Tika to completion, streaming content output to temporary file
- // (3) Modify RepositoryDocument to read from temporary file, and include
Tika-extracted metadata
+ // (3) Modify RepositoryDocument to read from temporary file, and include
+ // Tika-extracted metadata
// (4) Call downstream document processing
-
+
DestinationStorage ds;
-
- if (document.getBinaryLength() <= inMemoryMaximumFile)
- {
- ds = new MemoryDestinationStorage((int)document.getBinaryLength());
- }
- else
- {
+
+ if (document.getBinaryLength() <= inMemoryMaximumFile) {
+ ds = new MemoryDestinationStorage((int) document.getBinaryLength());
+ } else {
ds = new FileDestinationStorage();
}
- try
- {
+ try {
Metadata metadata = new Metadata();
- if (document.getFileName() != null)
- {
+ if (document.getFileName() != null) {
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY,
document.getFileName());
metadata.add("stream_name", document.getFileName());
}
- if (document.getMimeType() != null)
- metadata.add("Content-Type", document.getMimeType());
+ if (document.getMimeType() != null) {
+ mime = document.getMimeType();
+ metadata.add("Content-Type", mime);
+ }
metadata.add("stream_size", new
Long(document.getBinaryLength()).toString());
// We only log the extraction
@@ -210,334 +290,470 @@ public class TikaExtractor extends org.a
String resultCode = "OK";
String description = null;
Long length = null;
- try
- {
- OutputStream os = ds.getOutputStream();
- try
- {
- Writer w = new OutputStreamWriter(os,"utf-8");
- try
- {
- // Use tika to parse stuff
- ContentHandler handler =
TikaParser.newWriteOutBodyContentHandler(w, sp.writeLimit());
- if (extractorClassInstance != null)
- handler = new BoilerpipeContentHandler(handler,
extractorClassInstance);
- try
- {
- TikaParser.parse(document.getBinaryStream(), metadata, handler);
- }
- catch (TikaException e)
- {
- if (sp.ignoreTikaException())
- {
- resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
- description = e.getMessage();
+
+ try {
+ if (sp.tikaServer) {
+ try {
+ final HttpClient client = HttpClientBuilder.create().build();
+ final HttpHost tikaHost = new HttpHost(sp.tikaHostname,
sp.tikaPort);
+
+ // Make a copy of the original stream as it needs to be sent two
+ // times to Tika
+ // one for the metadata and one for the content
+ IOUtils.copy(document.getBinaryStream(), ds.getOutputStream());
+ HttpPut httpPut;
+ HttpEntity entity;
+
+ // Metadata
+ httpPut = new HttpPut(sp.metaURI);
+ if (!mime.isEmpty()) {
+ httpPut.addHeader("Content-Type", mime);
+ }
+ httpPut.addHeader("Accept", "application/json");
+ entity = new InputStreamEntity(ds.getInputStream());
+ httpPut.setEntity(entity);
+ while (retry < 3 && response == null) {
+ try {
+ response = client.execute(tikaHost, httpPut);
+ tikaServerDownException = null;
+ } catch (IOException e) {
+ tikaServerDownException = e;
+ retry++;
+ if (retry < 3) {
+ try {
+ Thread.sleep(sp.tikaRetry);
+ } catch (InterruptedException e1) {
+ // Should not happen
+ }
+ }
}
- else
- {
- resultCode = "TIKAREJECTION";
+ }
+ if (tikaServerDownException != null) {
+ throw tikaServerDownException;
+ }
+ int responseCode = response.getStatusLine().getStatusCode();
+ if (response.getStatusLine().getStatusCode() == 200 ||
response.getStatusLine().getStatusCode() == 204) {
+ tikaServerIs = response.getEntity().getContent();
+ try {
+ final BufferedReader br = new BufferedReader(new
InputStreamReader(tikaServerIs));
+ final JSONParser parser = new JSONParser();
+ JSONObject metaJson;
+ final StringBuilder sb = new StringBuilder();
+ String output;
+ while ((output = br.readLine()) != null) {
+ sb.append(output);
+ }
+ metaJson = (JSONObject) parser.parse(sb.toString());
+ for (Object key : metaJson.keySet()) {
+ metadata.add(key.toString(), metaJson.get(key).toString());
+ }
+ } finally {
+ tikaServerIs.close();
+ }
+ } else {
+ activities.noDocument();
+ if (responseCode == 422) {
+ resultCode = "TIKASERVERREJECTS";
+ description = "Tika Server rejected document with the
following reason: "
+ + response.getStatusLine().getReasonPhrase();
+ handleTikaServerRejects(description);
+ } else {
+ resultCode = "TIKASERVERERROR";
+ description = "Tika Server failed to parse document with the
following error: "
+ + response.getStatusLine().getReasonPhrase();
+ handleTikaServerError(description);
+ }
+ return DOCUMENTSTATUS_REJECTED;
+ }
+
+ // Content
+ httpPut = new HttpPut(sp.contentURI);
+ if (!mime.isEmpty()) {
+ httpPut.addHeader("Content-Type", mime);
+ }
+ httpPut.addHeader("Accept", "text/plain");
+ entity = new InputStreamEntity(ds.getInputStream());
+ httpPut.setEntity(entity);
+
+          // Retry mechanism
+ retry = 0;
+ response = null;
+ while (retry < 3 && response == null) {
+ try {
+ response = client.execute(tikaHost, httpPut);
+ tikaServerDownException = null;
+ } catch (IOException e) {
+ tikaServerDownException = e;
+ retry++;
+ if (retry < 3) {
+ try {
+ Thread.sleep(sp.tikaRetry);
+ } catch (InterruptedException e1) {
+ // Should not happen
+ }
+ }
+ }
+ }
+ if (tikaServerDownException != null) {
+ throw tikaServerDownException;
+ }
+
+ responseCode = response.getStatusLine().getStatusCode();
+ if (response.getStatusLine().getStatusCode() == 200 ||
response.getStatusLine().getStatusCode() == 204) {
+ tikaServerIs = response.getEntity().getContent();
+ try {
+ ds.close();
+ ds = new FileDestinationStorage();
+ IOUtils.copyLarge(tikaServerIs, ds.getOutputStream(), 0L,
sp.writeLimit);
+ length = new Long(ds.getBinaryLength());
+ } finally {
+ tikaServerIs.close();
+ }
+ } else {
+ activities.noDocument();
+ if (responseCode == 422) {
+ resultCode = "TIKASERVERREJECTS";
+ description = "Tika Server rejected document with the
following reason: "
+ + response.getStatusLine().getReasonPhrase();
+ handleTikaServerRejects(description);
+ } else {
+ resultCode = "TIKASERVERERROR";
+ description = "Tika Server failed to parse document with the
following error: "
+ + response.getStatusLine().getReasonPhrase();
+ handleTikaServerError(description);
+ }
+ return DOCUMENTSTATUS_REJECTED;
+ }
+
+ } catch (IOException | ParseException e) {
+ resultCode = "TIKASERVERRESPONSEISSUE";
+ description = e.getMessage();
+ int rval;
+ if (e instanceof IOException) {
+ rval = handleTikaServerException((IOException) e);
+ } else {
+ rval = handleTikaServerException((ParseException) e);
+ }
+ if (rval == DOCUMENTSTATUS_REJECTED) {
+ activities.noDocument();
+ }
+ return rval;
+ }
+ } else {
+
+ OutputStream os = ds.getOutputStream();
+ try {
+ Writer w = new OutputStreamWriter(os, "utf-8");
+ try {
+ // Use tika to parse stuff
+ ContentHandler handler =
TikaParser.newWriteOutBodyContentHandler(w, sp.writeLimit());
+ if (extractorClassInstance != null)
+ handler = new BoilerpipeContentHandler(handler,
extractorClassInstance);
+ try {
+ TikaParser.parse(document.getBinaryStream(), metadata,
handler);
+ } catch (TikaException e) {
+ if (sp.ignoreTikaException()) {
+ resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+ description = e.getMessage();
+ } else {
+ resultCode = "TIKAREJECTION";
+ description = e.getMessage();
+ int rval = handleTikaException(e);
+ if (rval == DOCUMENTSTATUS_REJECTED)
+ activities.noDocument();
+ return rval;
+ }
+ } catch (SAXException e) {
+ resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
description = e.getMessage();
- int rval = handleTikaException(e);
+ int rval = handleSaxException(e);
if (rval == DOCUMENTSTATUS_REJECTED)
activities.noDocument();
return rval;
+ } catch (IOException e) {
+ resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+ description = e.getMessage();
+ throw e;
}
+ } finally {
+ w.flush();
}
- catch (SAXException e)
- {
- resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
- description = e.getMessage();
- int rval = handleSaxException(e);
- if (rval == DOCUMENTSTATUS_REJECTED)
- activities.noDocument();
- return rval;
- }
- catch (IOException e)
- {
- resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
- description = e.getMessage();
- throw e;
- }
- }
- finally
- {
- w.flush();
+ } finally {
+ os.close();
+ length = new Long(ds.getBinaryLength());
}
}
- finally
- {
- os.close();
- length = new Long(ds.getBinaryLength());
- }
-
- // Check to be sure downstream pipeline will accept document of
specified length
- if (!activities.checkLengthIndexable(ds.getBinaryLength()))
- {
+
+ if (!activities.checkLengthIndexable(ds.getBinaryLength())) {
activities.noDocument();
resultCode = activities.EXCLUDED_LENGTH;
- description = "Downstream pipeline rejected document with length
"+ds.getBinaryLength();
+ description = "Downstream pipeline rejected document with length " +
ds.getBinaryLength();
return DOCUMENTSTATUS_REJECTED;
}
- }
- finally
- {
+ } finally {
// Log the extraction processing
- activities.recordActivity(new Long(startTime), ACTIVITY_EXTRACT,
length, documentURI,
- resultCode, description);
+ activities.recordActivity(new Long(startTime), ACTIVITY_EXTRACT,
length, documentURI, resultCode, description);
}
-
+
// Parsing complete!
// Create a copy of Repository Document
RepositoryDocument docCopy = document.duplicate();
-
+
// Get new stream length
long newBinaryLength = ds.getBinaryLength();
// Open new input stream
InputStream is = ds.getInputStream();
- try
- {
- docCopy.setBinary(is,newBinaryLength);
- // Set up all metadata from Tika. We may want to run this through a
mapper eventually...
+ try {
+ docCopy.setBinary(is, newBinaryLength);
+
+ // Set up all metadata from Tika. We may want to run this through a
+ // mapper eventually...
String[] metaNames = metadata.names();
- for(String mName : metaNames){
+ for (String mName : metaNames) {
String value = metadata.get(mName);
- if (sp.lowerNames())
- {
+ if (sp.lowerNames()) {
StringBuilder sb = new StringBuilder();
- for (int i=0; i<mName.length(); i++) {
+ for (int i = 0; i < mName.length(); i++) {
char ch = mName.charAt(i);
- if (!Character.isLetterOrDigit(ch)) ch='_';
- else ch=Character.toLowerCase(ch);
+ if (!Character.isLetterOrDigit(ch))
+ ch = '_';
+ else
+ ch = Character.toLowerCase(ch);
sb.append(ch);
}
mName = sb.toString();
}
String target = sp.getMapping(mName);
- if(target!=null)
- {
+ if (target != null) {
docCopy.addField(target, value);
- }
- else
- {
- if(sp.keepAllMetadata())
- {
- docCopy.addField(mName, value);
+ } else {
+ if (sp.keepAllMetadata()) {
+ docCopy.addField(mName, value);
}
}
}
// Send new document downstream
- return activities.sendDocument(documentURI,docCopy);
- }
- finally
- {
+ return activities.sendDocument(documentURI, docCopy);
+ } finally {
is.close();
}
- }
- finally
- {
+ } finally {
ds.close();
}
}
- /** Obtain the name of the form check javascript method to call.
- *@param connectionSequenceNumber is the unique number of this connection
within the job.
- *@return the name of the form check javascript method.
- */
- @Override
- public String getFormCheckJavascriptMethodName(int connectionSequenceNumber)
- {
- return "s"+connectionSequenceNumber+"_checkSpecification";
- }
-
- /** Obtain the name of the form presave check javascript method to call.
- *@param connectionSequenceNumber is the unique number of this connection
within the job.
- *@return the name of the form presave check javascript method.
- */
- @Override
- public String getFormPresaveCheckJavascriptMethodName(int
connectionSequenceNumber)
- {
- return "s"+connectionSequenceNumber+"_checkSpecificationForSave";
- }
-
- /** Output the specification header section.
- * This method is called in the head section of a job page which has selected
a pipeline connection of the current type. Its purpose is to add the required
tabs
- * to the list, and to output any javascript methods that might be needed by
the job editing HTML.
- *@param out is the output to which any HTML should be sent.
- *@param locale is the preferred local of the output.
- *@param os is the current pipeline specification for this connection.
- *@param connectionSequenceNumber is the unique number of this connection
within the job.
- *@param tabsArray is an array of tab names. Add to this array any tab names
that are specific to the connector.
- */
- @Override
- public void outputSpecificationHeader(IHTTPOutput out, Locale locale,
Specification os,
- int connectionSequenceNumber, List<String> tabsArray)
- throws ManifoldCFException, IOException
- {
+ /**
+ * Obtain the name of the form check javascript method to call.
+ *
+ * @param connectionSequenceNumber
+ * is the unique number of this connection within the job.
+ * @return the name of the form check javascript method.
+ */
+ @Override
+ public String getFormCheckJavascriptMethodName(int connectionSequenceNumber)
{
+ return "s" + connectionSequenceNumber + "_checkSpecification";
+ }
+
+ /**
+ * Obtain the name of the form presave check javascript method to call.
+ *
+ * @param connectionSequenceNumber
+ * is the unique number of this connection within the job.
+ * @return the name of the form presave check javascript method.
+ */
+ @Override
+ public String getFormPresaveCheckJavascriptMethodName(int
connectionSequenceNumber) {
+ return "s" + connectionSequenceNumber + "_checkSpecificationForSave";
+ }
+
+ /**
+ * Output the specification header section. This method is called in the head
+ * section of a job page which has selected a pipeline connection of the
+ * current type. Its purpose is to add the required tabs to the list, and to
+ * output any javascript methods that might be needed by the job editing
HTML.
+ *
+ * @param out
+ * is the output to which any HTML should be sent.
+ * @param locale
+ * is the preferred locale of the output.
+ * @param os
+ * is the current pipeline specification for this connection.
+ * @param connectionSequenceNumber
+ * is the unique number of this connection within the job.
+ * @param tabsArray
+ * is an array of tab names. Add to this array any tab names that
are
+ * specific to the connector.
+ */
+ @Override
+ public void outputSpecificationHeader(IHTTPOutput out, Locale locale,
Specification os, int connectionSequenceNumber,
+ List<String> tabsArray) throws ManifoldCFException, IOException {
Map<String, Object> paramMap = new HashMap<String, Object>();
- paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
+ paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
+ tabsArray.add(Messages.getString(locale, "TikaExtractor.TikaTypeTabName"));
tabsArray.add(Messages.getString(locale,
"TikaExtractor.FieldMappingTabName"));
tabsArray.add(Messages.getString(locale,
"TikaExtractor.ExceptionsTabName"));
tabsArray.add(Messages.getString(locale,
"TikaExtractor.BoilerplateTabName"));
// Fill in the specification header map, using data from all tabs.
+ fillInTikaTypeSpecificationMap(paramMap, os);
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
fillInBoilerplateSpecificationMap(paramMap, os);
-
-
Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_JS,paramMap);
+
+ Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS,
paramMap);
}
-
- /** Output the specification body section.
- * This method is called in the body section of a job page which has selected
a pipeline connection of the current type. Its purpose is to present the
required form elements for editing.
- * The coder can presume that the HTML that is output from this configuration
will be within appropriate <html>, <body>, and <form> tags. The name of the
- * form is "editjob".
- *@param out is the output to which any HTML should be sent.
- *@param locale is the preferred local of the output.
- *@param os is the current pipeline specification for this job.
- *@param connectionSequenceNumber is the unique number of this connection
within the job.
- *@param actualSequenceNumber is the connection within the job that has
currently been selected.
- *@param tabName is the current tab name.
- */
- @Override
- public void outputSpecificationBody(IHTTPOutput out, Locale locale,
Specification os,
- int connectionSequenceNumber, int actualSequenceNumber, String tabName)
- throws ManifoldCFException, IOException
- {
+
+ /**
+ * Output the specification body section. This method is called in the body
+ * section of a job page which has selected a pipeline connection of the
+ * current type. Its purpose is to present the required form elements for
+ * editing. The coder can presume that the HTML that is output from this
+ * configuration will be within appropriate <html>, <body>, and <form> tags.
+ * The name of the form is "editjob".
+ *
+ * @param out
+ * is the output to which any HTML should be sent.
+ * @param locale
+ * is the preferred locale of the output.
+ * @param os
+ * is the current pipeline specification for this job.
+ * @param connectionSequenceNumber
+ * is the unique number of this connection within the job.
+ * @param actualSequenceNumber
+ * is the connection within the job that has currently been
selected.
+ * @param tabName
+ * is the current tab name.
+ */
+ @Override
+ public void outputSpecificationBody(IHTTPOutput out, Locale locale,
Specification os, int connectionSequenceNumber,
+ int actualSequenceNumber, String tabName) throws ManifoldCFException,
IOException {
Map<String, Object> paramMap = new HashMap<String, Object>();
// Set the tab name
paramMap.put("TABNAME", tabName);
- paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
- paramMap.put("SELECTEDNUM",Integer.toString(actualSequenceNumber));
+ paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
+ paramMap.put("SELECTEDNUM", Integer.toString(actualSequenceNumber));
// Fill in the field mapping tab data
+ fillInTikaTypeSpecificationMap(paramMap, os);
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
fillInBoilerplateSpecificationMap(paramMap, os);
-
-
Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_FIELDMAPPING_HTML,paramMap);
-
Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_EXCEPTIONS_HTML,paramMap);
-
Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_BOILERPLATE_HTML,paramMap);
- }
-
- /** Process a specification post.
- * This method is called at the start of job's edit or view page, whenever
there is a possibility that form data for a connection has been
- * posted. Its purpose is to gather form information and modify the
transformation specification accordingly.
- * The name of the posted form is "editjob".
- *@param variableContext contains the post data, including binary file-upload
information.
- *@param locale is the preferred local of the output.
- *@param os is the current pipeline specification for this job.
- *@param connectionSequenceNumber is the unique number of this connection
within the job.
- *@return null if all is well, or a string error message if there is an error
that should prevent saving of the job (and cause a redirection to an error
page).
- */
+
+ Messages.outputResourceWithVelocity(out, locale,
EDIT_SPECIFICATION_TIKATYPE_HTML, paramMap);
+ Messages.outputResourceWithVelocity(out, locale,
EDIT_SPECIFICATION_FIELDMAPPING_HTML, paramMap);
+ Messages.outputResourceWithVelocity(out, locale,
EDIT_SPECIFICATION_EXCEPTIONS_HTML, paramMap);
+ Messages.outputResourceWithVelocity(out, locale,
EDIT_SPECIFICATION_BOILERPLATE_HTML, paramMap);
+ }
+
+ /**
+ * Process a specification post. This method is called at the start of job's
+ * edit or view page, whenever there is a possibility that form data for a
+ * connection has been posted. Its purpose is to gather form information and
+ * modify the transformation specification accordingly. The name of the
posted
+ * form is "editjob".
+ *
+ * @param variableContext
+ * contains the post data, including binary file-upload information.
+ * @param locale
+ * is the preferred locale of the output.
+ * @param os
+ * is the current pipeline specification for this job.
+ * @param connectionSequenceNumber
+ * is the unique number of this connection within the job.
+ * @return null if all is well, or a string error message if there is an
error
+ * that should prevent saving of the job (and cause a redirection to
+ * an error page).
+ */
@Override
public String processSpecificationPost(IPostParameters variableContext,
Locale locale, Specification os,
- int connectionSequenceNumber)
- throws ManifoldCFException {
- String seqPrefix = "s"+connectionSequenceNumber+"_";
+ int connectionSequenceNumber) throws ManifoldCFException {
+ String seqPrefix = "s" + connectionSequenceNumber + "_";
String x;
-
- x = variableContext.getParameter(seqPrefix+"fieldmapping_count");
- if (x != null && x.length() > 0)
- {
+
+ x = variableContext.getParameter(seqPrefix + "fieldmapping_count");
+ if (x != null && x.length() > 0) {
// About to gather the fieldmapping nodes, so get rid of the old ones.
int i = 0;
- while (i < os.getChildCount())
- {
+ while (i < os.getChildCount()) {
SpecificationNode node = os.getChild(i);
- if (node.getType().equals(TikaConfig.NODE_FIELDMAP)
- || node.getType().equals(TikaConfig.NODE_KEEPMETADATA)
- || node.getType().equals(TikaConfig.NODE_LOWERNAMES)
- || node.getType().equals(TikaConfig.NODE_WRITELIMIT))
+ if (node.getType().equals(TikaConfig.NODE_FIELDMAP) ||
node.getType().equals(TikaConfig.NODE_KEEPMETADATA)
+ || node.getType().equals(TikaConfig.NODE_LOWERNAMES) ||
node.getType().equals(TikaConfig.NODE_WRITELIMIT))
os.removeChild(i);
else
i++;
}
int count = Integer.parseInt(x);
i = 0;
- while (i < count)
- {
- String prefix = seqPrefix+"fieldmapping_";
- String suffix = "_"+Integer.toString(i);
- String op = variableContext.getParameter(prefix+"op"+suffix);
- if (op == null || !op.equals("Delete"))
- {
+ while (i < count) {
+ String prefix = seqPrefix + "fieldmapping_";
+ String suffix = "_" + Integer.toString(i);
+ String op = variableContext.getParameter(prefix + "op" + suffix);
+ if (op == null || !op.equals("Delete")) {
// Gather the fieldmap etc.
- String source = variableContext.getParameter(prefix+"source"+suffix);
- String target = variableContext.getParameter(prefix+"target"+suffix);
+ String source = variableContext.getParameter(prefix + "source" +
suffix);
+ String target = variableContext.getParameter(prefix + "target" +
suffix);
if (target == null)
target = "";
SpecificationNode node = new
SpecificationNode(TikaConfig.NODE_FIELDMAP);
- node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE,source);
- node.setAttribute(TikaConfig.ATTRIBUTE_TARGET,target);
- os.addChild(os.getChildCount(),node);
+ node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE, source);
+ node.setAttribute(TikaConfig.ATTRIBUTE_TARGET, target);
+ os.addChild(os.getChildCount(), node);
}
i++;
}
-
- String addop = variableContext.getParameter(seqPrefix+"fieldmapping_op");
- if (addop != null && addop.equals("Add"))
- {
- String source =
variableContext.getParameter(seqPrefix+"fieldmapping_source");
- String target =
variableContext.getParameter(seqPrefix+"fieldmapping_target");
+
+ String addop = variableContext.getParameter(seqPrefix +
"fieldmapping_op");
+ if (addop != null && addop.equals("Add")) {
+ String source = variableContext.getParameter(seqPrefix +
"fieldmapping_source");
+ String target = variableContext.getParameter(seqPrefix +
"fieldmapping_target");
if (target == null)
target = "";
SpecificationNode node = new
SpecificationNode(TikaConfig.NODE_FIELDMAP);
- node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE,source);
- node.setAttribute(TikaConfig.ATTRIBUTE_TARGET,target);
- os.addChild(os.getChildCount(),node);
+ node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE, source);
+ node.setAttribute(TikaConfig.ATTRIBUTE_TARGET, target);
+ os.addChild(os.getChildCount(), node);
}
-
+
// Gather the keep all metadata parameter to be the last one
SpecificationNode node = new
SpecificationNode(TikaConfig.NODE_KEEPMETADATA);
- String keepAll =
variableContext.getParameter(seqPrefix+"keepallmetadata");
- if (keepAll != null)
- {
+ String keepAll = variableContext.getParameter(seqPrefix +
"keepallmetadata");
+ if (keepAll != null) {
node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, keepAll);
- }
- else
- {
+ } else {
node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
}
- // Add the new keepallmetadata config parameter
+ // Add the new keepallmetadata config parameter
os.addChild(os.getChildCount(), node);
-
+
SpecificationNode node2 = new
SpecificationNode(TikaConfig.NODE_LOWERNAMES);
- String lower = variableContext.getParameter(seqPrefix+"lowernames");
- if (lower != null)
- {
+ String lower = variableContext.getParameter(seqPrefix + "lowernames");
+ if (lower != null) {
node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, lower);
- }
- else
- {
+ } else {
node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
}
os.addChild(os.getChildCount(), node2);
-
+
SpecificationNode node3 = new
SpecificationNode(TikaConfig.NODE_WRITELIMIT);
- String writeLimit = variableContext.getParameter(seqPrefix+"writelimit");
- if (writeLimit != null)
- {
+ String writeLimit = variableContext.getParameter(seqPrefix +
"writelimit");
+ if (writeLimit != null) {
node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, writeLimit);
- }
- else
- {
+ } else {
node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
}
os.addChild(os.getChildCount(), node3);
}
-
- if (variableContext.getParameter(seqPrefix+"ignoretikaexceptions_present")
!= null)
- {
+
+ if (variableContext.getParameter(seqPrefix +
"ignoretikaexceptions_present") != null) {
int i = 0;
- while (i < os.getChildCount())
- {
+ while (i < os.getChildCount()) {
SpecificationNode node = os.getChild(i);
if (node.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION))
os.removeChild(i);
@@ -545,7 +761,7 @@ public class TikaExtractor extends org.a
i++;
}
- String value =
variableContext.getParameter(seqPrefix+"ignoretikaexceptions");
+ String value = variableContext.getParameter(seqPrefix +
"ignoretikaexceptions");
if (value == null)
value = "false";
@@ -553,13 +769,11 @@ public class TikaExtractor extends org.a
node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, value);
os.addChild(os.getChildCount(), node);
}
-
- x = variableContext.getParameter(seqPrefix+"boilerplateclassname");
- if (x != null)
- {
+
+ x = variableContext.getParameter(seqPrefix + "boilerplateclassname");
+ if (x != null) {
int i = 0;
- while (i < os.getChildCount())
- {
+ while (i < os.getChildCount()) {
SpecificationNode node = os.getChild(i);
if (node.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
os.removeChild(i);
@@ -567,183 +781,265 @@ public class TikaExtractor extends org.a
i++;
}
- if (x.length() > 0)
- {
+ if (x.length() > 0) {
SpecificationNode node = new
SpecificationNode(TikaConfig.NODE_BOILERPLATEPROCESSOR);
node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, x);
os.addChild(os.getChildCount(), node);
}
}
-
+
+ x = variableContext.getParameter(seqPrefix + "tikaserver");
+ if (x != null) {
+ int i = 0;
+ while (i < os.getChildCount()) {
+ SpecificationNode node = os.getChild(i);
+ if (node.getType().equals(TikaConfig.NODE_TIKASERVER) ||
node.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)
+ || node.getType().equals(TikaConfig.NODE_TIKAPORT) ||
node.getType().equals(TikaConfig.NODE_TIKARETRY))
+ os.removeChild(i);
+ else
+ i++;
+ }
+
+ SpecificationNode node = new
SpecificationNode(TikaConfig.NODE_TIKASERVER);
+ String tikaServer = variableContext.getParameter(seqPrefix +
"tikaserver");
+ if (tikaServer != null) {
+ node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaServer);
+ } else {
+ node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
+ }
+ // Add the new tikaserver config parameter
+ os.addChild(os.getChildCount(), node);
+
+ SpecificationNode node2 = new
SpecificationNode(TikaConfig.NODE_TIKAHOSTNAME);
+ String tikaHostname = variableContext.getParameter(seqPrefix +
"tikahostname");
+ if (tikaHostname != null) {
+ node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaHostname);
+ } else {
+ node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
+ }
+ // Add the new tikahostname config parameter
+ os.addChild(os.getChildCount(), node2);
+
+ SpecificationNode node3 = new
SpecificationNode(TikaConfig.NODE_TIKAPORT);
+ String tikaPort = variableContext.getParameter(seqPrefix + "tikaport");
+ if (tikaPort != null) {
+ node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaPort);
+ } else {
+ node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
+ }
+ // Add the new tikaport config parameter
+ os.addChild(os.getChildCount(), node3);
+
+ SpecificationNode node4 = new
SpecificationNode(TikaConfig.NODE_TIKARETRY);
+ String tikaRetry = variableContext.getParameter(seqPrefix + "tikaretry");
+ if (tikaRetry != null) {
+ node4.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaRetry);
+ } else {
+ node4.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
+ }
+ // Add the new tikaretry config parameter
+ os.addChild(os.getChildCount(), node4);
+ }
+
return null;
}
-
- /** View specification.
- * This method is called in the body section of a job's view page. Its
purpose is to present the pipeline specification information to the user.
- * The coder can presume that the HTML that is output from this configuration
will be within appropriate <html> and <body> tags.
- *@param out is the output to which any HTML should be sent.
- *@param locale is the preferred local of the output.
- *@param connectionSequenceNumber is the unique number of this connection
within the job.
- *@param os is the current pipeline specification for this job.
- */
- @Override
- public void viewSpecification(IHTTPOutput out, Locale locale, Specification
os,
- int connectionSequenceNumber)
- throws ManifoldCFException, IOException
- {
+ /**
+ * View specification. This method is called in the body section of a job's
+ * view page. Its purpose is to present the pipeline specification
information
+ * to the user. The coder can presume that the HTML that is output from this
+ * configuration will be within appropriate <html> and <body> tags.
+ *
+ * @param out
+ * is the output to which any HTML should be sent.
+ * @param locale
+ * is the preferred locale of the output.
+ * @param connectionSequenceNumber
+ * is the unique number of this connection within the job.
+ * @param os
+ * is the current pipeline specification for this job.
+ */
+ @Override
+ public void viewSpecification(IHTTPOutput out, Locale locale, Specification
os, int connectionSequenceNumber)
+ throws ManifoldCFException, IOException {
Map<String, Object> paramMap = new HashMap<String, Object>();
- paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
+ paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
// Fill in the map with data from all tabs
+ fillInTikaTypeSpecificationMap(paramMap, os);
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
fillInBoilerplateSpecificationMap(paramMap, os);
-
Messages.outputResourceWithVelocity(out,locale,VIEW_SPECIFICATION_HTML,paramMap);
-
+ Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML,
paramMap);
+
+ }
+
+ protected static void fillInTikaTypeSpecificationMap(Map<String, Object>
paramMap, Specification os) {
+ String tikaServer = "false";
+ String tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
+ String tikaPort = String.valueOf(TikaConfig.TIKAPORT_DEFAULT);
+ String tikaRetry = String.valueOf(TikaConfig.TIKARETRY_DEFAULT);
+ for (int i = 0; i < os.getChildCount(); i++) {
+ SpecificationNode sn = os.getChild(i);
+ if (sn.getType().equals(TikaConfig.NODE_TIKASERVER)) {
+ tikaServer = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ } else if (sn.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)) {
+ tikaHostname = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ } else if (sn.getType().equals(TikaConfig.NODE_TIKAPORT)) {
+ tikaPort = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ } else if (sn.getType().equals(TikaConfig.NODE_TIKARETRY)) {
+ tikaRetry = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ }
+ }
+ paramMap.put("TIKASERVER", tikaServer);
+ paramMap.put("TIKAHOSTNAME", tikaHostname);
+ paramMap.put("TIKAPORT", tikaPort);
+ paramMap.put("TIKARETRY", tikaRetry);
}
- protected static void fillInFieldMappingSpecificationMap(Map<String,Object>
paramMap, Specification os)
- {
+ protected static void fillInFieldMappingSpecificationMap(Map<String, Object>
paramMap, Specification os) {
// Prep for field mappings
- List<Map<String,String>> fieldMappings = new
ArrayList<Map<String,String>>();
+ List<Map<String, String>> fieldMappings = new ArrayList<Map<String,
String>>();
String keepAllMetadataValue = "true";
String lowernamesValue = "false";
String writeLimitValue = "";
- for (int i = 0; i < os.getChildCount(); i++)
- {
+ for (int i = 0; i < os.getChildCount(); i++) {
SpecificationNode sn = os.getChild(i);
if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
String targetDisplay;
- if (target == null)
- {
+ if (target == null) {
target = "";
targetDisplay = "(remove)";
- }
- else
+ } else
targetDisplay = target;
- Map<String,String> fieldMapping = new HashMap<String,String>();
- fieldMapping.put("SOURCE",source);
- fieldMapping.put("TARGET",target);
- fieldMapping.put("TARGETDISPLAY",targetDisplay);
+ Map<String, String> fieldMapping = new HashMap<String, String>();
+ fieldMapping.put("SOURCE", source);
+ fieldMapping.put("TARGET", target);
+ fieldMapping.put("TARGETDISPLAY", targetDisplay);
fieldMappings.add(fieldMapping);
- }
- else if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA))
- {
+ } else if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
keepAllMetadataValue =
sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- }
- else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES))
- {
+ } else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
lowernamesValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- }
- else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT))
- {
+ } else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
writeLimitValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
}
}
- paramMap.put("FIELDMAPPINGS",fieldMappings);
- paramMap.put("KEEPALLMETADATA",keepAllMetadataValue);
- paramMap.put("LOWERNAMES",lowernamesValue);
- paramMap.put("WRITELIMIT",writeLimitValue);
+ paramMap.put("FIELDMAPPINGS", fieldMappings);
+ paramMap.put("KEEPALLMETADATA", keepAllMetadataValue);
+ paramMap.put("LOWERNAMES", lowernamesValue);
+ paramMap.put("WRITELIMIT", writeLimitValue);
}
- protected static void fillInExceptionsSpecificationMap(Map<String,Object>
paramMap, Specification os)
- {
+ protected static void fillInExceptionsSpecificationMap(Map<String, Object>
paramMap, Specification os) {
String ignoreTikaExceptions = "true";
- for (int i = 0; i < os.getChildCount(); i++)
- {
+ for (int i = 0; i < os.getChildCount(); i++) {
SpecificationNode sn = os.getChild(i);
- if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION))
- {
+ if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION)) {
ignoreTikaExceptions =
sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
}
}
- paramMap.put("IGNORETIKAEXCEPTIONS",ignoreTikaExceptions);
+ paramMap.put("IGNORETIKAEXCEPTIONS", ignoreTikaExceptions);
}
- protected static void fillInBoilerplateSpecificationMap(Map<String,Object>
paramMap, Specification os)
- {
+ protected static void fillInBoilerplateSpecificationMap(Map<String, Object>
paramMap, Specification os) {
String boilerplateClassName = "";
- for (int i = 0; i < os.getChildCount(); i++)
- {
+ for (int i = 0; i < os.getChildCount(); i++) {
SpecificationNode sn = os.getChild(i);
- if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
- {
+ if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR)) {
boilerplateClassName =
sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
}
}
- paramMap.put("BOILERPLATECLASSNAME",boilerplateClassName);
+ paramMap.put("BOILERPLATECLASSNAME", boilerplateClassName);
}
protected static int handleTikaException(TikaException e)
- throws IOException, ManifoldCFException, ServiceInterruption
- {
+ throws IOException, ManifoldCFException, ServiceInterruption {
+ // MHL - what does Tika throw if it gets an IOException reading the
stream??
+ Logging.ingest.warn("Tika: Tika exception extracting: " + e.getMessage(),
e);
+ return DOCUMENTSTATUS_REJECTED;
+ }
+
+ protected static int handleTikaServerRejects(String reason)
+ throws IOException, ManifoldCFException, ServiceInterruption {
+ // MHL - what does Tika throw if it gets an IOException reading the
stream??
+ Logging.ingest.warn("Tika Server: Tika Server rejects: " + reason);
+ return DOCUMENTSTATUS_REJECTED;
+ }
+
+ protected static int handleTikaServerError(String description)
+ throws IOException, ManifoldCFException, ServiceInterruption {
+ // MHL - what does Tika throw if it gets an IOException reading the
stream??
+ Logging.ingest.warn("Tika Server: Tika Server error: " + description);
+ return DOCUMENTSTATUS_REJECTED;
+ }
+
+ protected static int handleTikaServerException(IOException e)
+ throws IOException, ManifoldCFException, ServiceInterruption {
// MHL - what does Tika throw if it gets an IOException reading the
stream??
- Logging.ingest.warn("Tika: Tika exception extracting: "+e.getMessage(),e);
+ Logging.ingest.warn("Tika: Tika exception extracting: " + e.getMessage(),
e);
return DOCUMENTSTATUS_REJECTED;
}
-
- protected static int handleSaxException(SAXException e)
- throws IOException, ManifoldCFException, ServiceInterruption
- {
+
+ protected static int handleTikaServerException(ParseException e)
+ throws IOException, ManifoldCFException, ServiceInterruption {
+ // MHL - what does Tika throw if it gets an IOException reading the
stream??
+ Logging.ingest.warn("Tika: Tika exception extracting: " + e.getMessage(),
e);
+ return DOCUMENTSTATUS_REJECTED;
+ }
+
+ protected static int handleSaxException(SAXException e) throws IOException,
ManifoldCFException, ServiceInterruption {
// MHL - what does this mean?
- Logging.ingest.warn("Tika: SAX exception extracting: "+e.getMessage(),e);
+ Logging.ingest.warn("Tika: SAX exception extracting: " + e.getMessage(),
e);
return DOCUMENTSTATUS_REJECTED;
}
-
- protected static int handleIOException(IOException e)
- throws ManifoldCFException
- {
+
+ protected static int handleIOException(IOException e) throws
ManifoldCFException {
// IOException reading from our local storage...
if (e instanceof InterruptedIOException)
- throw new
ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
- throw new ManifoldCFException(e.getMessage(),e);
+ throw new ManifoldCFException(e.getMessage(), e,
ManifoldCFException.INTERRUPTED);
+ throw new ManifoldCFException(e.getMessage(), e);
}
-
- protected static interface DestinationStorage
- {
- /** Get the output stream to write to. Caller should explicitly close
this stream when done writing.
- */
- public OutputStream getOutputStream()
- throws ManifoldCFException;
-
- /** Get new binary length.
- */
- public long getBinaryLength()
- throws ManifoldCFException;
-
- /** Get the input stream to read from. Caller should explicitly close
this stream when done reading.
- */
- public InputStream getInputStream()
- throws ManifoldCFException;
-
- /** Close the object and clean up everything.
- * This should be called when the data is no longer needed.
- */
- public void close()
- throws ManifoldCFException;
- }
-
- protected static class FileDestinationStorage implements DestinationStorage
- {
+
+ protected static interface DestinationStorage {
+ /**
+ * Get the output stream to write to. Caller should explicitly close this
+ * stream when done writing.
+ */
+ public OutputStream getOutputStream() throws ManifoldCFException;
+
+ /**
+ * Get new binary length.
+ */
+ public long getBinaryLength() throws ManifoldCFException;
+
+ /**
+ * Get the input stream to read from. Caller should explicitly close this
+ * stream when done reading.
+ */
+ public InputStream getInputStream() throws ManifoldCFException;
+
+ /**
+ * Close the object and clean up everything. This should be called when the
+ * data is no longer needed.
+ */
+ public void close() throws ManifoldCFException;
+ }
+
+ protected static class FileDestinationStorage implements DestinationStorage {
protected final File outputFile;
protected final OutputStream outputStream;
- public FileDestinationStorage()
- throws ManifoldCFException
- {
+ public FileDestinationStorage() throws ManifoldCFException {
File outputFile;
OutputStream outputStream;
- try
- {
- outputFile = File.createTempFile("mcftika","tmp");
+ try {
+ outputFile = File.createTempFile("mcftika", "tmp");
outputStream = new FileOutputStream(outputFile);
- }
- catch (IOException e)
- {
+ } catch (IOException e) {
handleIOException(e);
outputFile = null;
outputStream = null;
@@ -751,121 +1047,127 @@ public class TikaExtractor extends org.a
this.outputFile = outputFile;
this.outputStream = outputStream;
}
-
+
@Override
- public OutputStream getOutputStream()
- throws ManifoldCFException
- {
+ public OutputStream getOutputStream() throws ManifoldCFException {
return outputStream;
}
-
- /** Get new binary length.
- */
+
+ /**
+ * Get new binary length.
+ */
@Override
- public long getBinaryLength()
- throws ManifoldCFException
- {
+ public long getBinaryLength() throws ManifoldCFException {
return outputFile.length();
}
- /** Get the input stream to read from. Caller should explicitly close
this stream when done reading.
- */
+ /**
+ * Get the input stream to read from. Caller should explicitly close this
+ * stream when done reading.
+ */
@Override
- public InputStream getInputStream()
- throws ManifoldCFException
- {
- try
- {
+ public InputStream getInputStream() throws ManifoldCFException {
+ try {
return new FileInputStream(outputFile);
- }
- catch (IOException e)
- {
+ } catch (IOException e) {
handleIOException(e);
return null;
}
}
-
- /** Close the object and clean up everything.
- * This should be called when the data is no longer needed.
- */
+
+ /**
+ * Close the object and clean up everything. This should be called when the
+ * data is no longer needed.
+ */
@Override
- public void close()
- throws ManifoldCFException
- {
+ public void close() throws ManifoldCFException {
outputFile.delete();
}
}
-
- protected static class MemoryDestinationStorage implements DestinationStorage
- {
+
+ protected static class MemoryDestinationStorage implements
DestinationStorage {
protected final ByteArrayOutputStream outputStream;
-
- public MemoryDestinationStorage(int sizeHint)
- {
+
+ public MemoryDestinationStorage(int sizeHint) {
outputStream = new ByteArrayOutputStream(sizeHint);
}
-
+
@Override
- public OutputStream getOutputStream()
- throws ManifoldCFException
- {
+ public OutputStream getOutputStream() throws ManifoldCFException {
return outputStream;
}
- /** Get new binary length.
- */
+ /**
+ * Get new binary length.
+ */
@Override
- public long getBinaryLength()
- throws ManifoldCFException
- {
+ public long getBinaryLength() throws ManifoldCFException {
return outputStream.size();
}
-
- /** Get the input stream to read from. Caller should explicitly close
this stream when done reading.
- */
+
+ /**
+ * Get the input stream to read from. Caller should explicitly close this
+ * stream when done reading.
+ */
@Override
- public InputStream getInputStream()
- throws ManifoldCFException
- {
+ public InputStream getInputStream() throws ManifoldCFException {
return new ByteArrayInputStream(outputStream.toByteArray());
}
-
- /** Close the object and clean up everything.
- * This should be called when the data is no longer needed.
- */
- public void close()
- throws ManifoldCFException
- {
+
+ /**
+ * Close the object and clean up everything. This should be called when the
+ * data is no longer needed.
+ */
+ public void close() throws ManifoldCFException {
}
}
protected static class SpecPacker {
-
- private final Map<String,String> sourceTargets = new
HashMap<String,String>();
+
+ private final Map<String, String> sourceTargets = new HashMap<String,
String>();
private final boolean keepAllMetadata;
private final boolean lowerNames;
private final int writeLimit;
private final boolean ignoreTikaException;
private final String extractorClassName;
-
+ private URI metaURI;
+ private URI contentURI;
+ private final String tikaHostname;
+ private final int tikaPort;
+ private final boolean tikaServer;
+ private final long tikaRetry;
+
public SpecPacker(Specification os) {
boolean keepAllMetadata = true;
boolean lowerNames = false;
int writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
boolean ignoreTikaException = true;
String extractorClassName = null;
+ String tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
+ int tikaPort = TikaConfig.TIKAPORT_DEFAULT;
+ boolean tikaServer = false;
+ long tikaRetry = TikaConfig.TIKARETRY_DEFAULT;
+ try {
+ metaURI = new URI("/meta");
+ contentURI = new URI("/tika");
+ } catch (URISyntaxException e) {
+ // Should be impossible
+ metaURI = null;
+ contentURI = null;
+ }
+
for (int i = 0; i < os.getChildCount(); i++) {
SpecificationNode sn = os.getChild(i);
-
- if(sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
+
+ if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
keepAllMetadata = Boolean.parseBoolean(value);
- } else if(sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
+ } else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
lowerNames = Boolean.parseBoolean(value);
- } else if(sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
+ } else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
if (value.length() == 0) {
writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
@@ -875,7 +1177,7 @@ public class TikaExtractor extends org.a
} else if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
-
+
if (target == null) {
target = "";
}
@@ -885,6 +1187,34 @@ public class TikaExtractor extends org.a
ignoreTikaException = Boolean.parseBoolean(value);
} else if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR)) {
extractorClassName =
sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ } else if (sn.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)) {
+ String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ if (value.length() == 0) {
+ tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
+ } else {
+ tikaHostname = value;
+ }
+ } else if (sn.getType().equals(TikaConfig.NODE_TIKAPORT)) {
+ String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ if (value.length() == 0) {
+ tikaPort = TikaConfig.TIKAPORT_DEFAULT;
+ } else {
+ tikaPort = Integer.parseInt(value);
+ }
+ } else if (sn.getType().equals(TikaConfig.NODE_TIKASERVER)) {
+ String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ if (value.length() == 0) {
+ tikaServer = false;
+ } else {
+ tikaServer = Boolean.parseBoolean(value);
+ }
+ } else if (sn.getType().equals(TikaConfig.NODE_TIKARETRY)) {
+ String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ if (value.length() == 0) {
+ tikaRetry = TikaConfig.TIKARETRY_DEFAULT;
+ } else {
+ tikaRetry = Long.parseLong(value);
+ }
}
}
this.keepAllMetadata = keepAllMetadata;
@@ -892,12 +1222,16 @@ public class TikaExtractor extends org.a
this.writeLimit = writeLimit;
this.ignoreTikaException = ignoreTikaException;
this.extractorClassName = extractorClassName;
+ this.tikaHostname = tikaHostname;
+ this.tikaPort = tikaPort;
+ this.tikaServer = tikaServer;
+ this.tikaRetry = tikaRetry;
}
-
+
public String toPackedString() {
StringBuilder sb = new StringBuilder();
int i;
-
+
// Mappings
final String[] sortArray = new String[sourceTargets.size()];
i = 0;
@@ -905,7 +1239,7 @@ public class TikaExtractor extends org.a
sortArray[i++] = source;
}
java.util.Arrays.sort(sortArray);
-
+
List<String> packedMappings = new ArrayList<String>();
String[] fixedList = new String[2];
for (String source : sortArray) {
@@ -913,10 +1247,10 @@ public class TikaExtractor extends org.a
StringBuilder localBuffer = new StringBuilder();
fixedList[0] = source;
fixedList[1] = target;
- packFixedList(localBuffer,fixedList,':');
+ packFixedList(localBuffer, fixedList, ':');
packedMappings.add(localBuffer.toString());
}
- packList(sb,packedMappings,'+');
+ packList(sb, packedMappings, '+');
// Keep all metadata
if (keepAllMetadata)
@@ -924,12 +1258,11 @@ public class TikaExtractor extends org.a
else
sb.append('-');
if (lowerNames)
- sb.append('+');
- else
- sb.append('-');
+ sb.append('+');
+ else
+ sb.append('-');
- if (writeLimit != TikaConfig.WRITELIMIT_DEFAULT)
- {
+ if (writeLimit != TikaConfig.WRITELIMIT_DEFAULT) {
sb.append('+');
sb.append(writeLimit);
}
@@ -939,55 +1272,60 @@ public class TikaExtractor extends org.a
else
sb.append('-');
- if (extractorClassName != null)
- {
+ if (extractorClassName != null) {
sb.append('+');
sb.append(extractorClassName);
- }
- else
+ } else
sb.append('-');
-
+
return sb.toString();
}
-
+
+ public URI metaURI() {
+ return metaURI;
+ }
+
+ public URI contentURI() {
+ return contentURI;
+ }
+
public String getMapping(String source) {
return sourceTargets.get(source);
}
-
+
public boolean keepAllMetadata() {
return keepAllMetadata;
}
-
+
public boolean lowerNames() {
return lowerNames;
}
-
+
public int writeLimit() {
return writeLimit;
}
-
+
public boolean ignoreTikaException() {
return ignoreTikaException;
}
-
- public BoilerpipeExtractor getExtractorClassInstance()
- throws ManifoldCFException {
+
+ public BoilerpipeExtractor getExtractorClassInstance() throws
ManifoldCFException {
if (extractorClassName == null)
return null;
try {
ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
Class extractorClass = loader.loadClass(extractorClassName);
java.lang.reflect.Field f = extractorClass.getField("INSTANCE");
- return (BoilerpipeExtractor)f.get(null);
+ return (BoilerpipeExtractor) f.get(null);
} catch (ClassNotFoundException e) {
- throw new ManifoldCFException("Boilerpipe extractor class
'"+extractorClassName+"' not found: "+e.getMessage(),e);
+ throw new ManifoldCFException(
+ "Boilerpipe extractor class '" + extractorClassName + "' not
found: " + e.getMessage(), e);
} catch (Exception e) {
- throw new ManifoldCFException("Boilerpipe extractor class
'"+extractorClassName+"' exception on instantiation: "+e.getMessage(),e);
+ throw new ManifoldCFException(
+ "Boilerpipe extractor class '" + extractorClassName + "' exception
on instantiation: " + e.getMessage(), e);
}
}
}
}
-
-
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
(original)
+++
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
Wed May 10 13:27:33 2017
@@ -13,6 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+TikaExtractor.TikaHostname=Tika hostname:
+TikaExtractor.TikaPort=Tika port:
+TikaExtractor.TikaRetry=Retry interval (ms):
+TikaExtractor.TikaParsersSelected=Tika Parsers
+TikaExtractor.TikaServerSelected=Tika Server
+TikaExtractor.TikaTypeTabName=Tika type
+TikaExtractor.TikaType=Tika type:
TikaExtractor.FieldMappingTabName=Field mapping
TikaExtractor.ExceptionsTabName=Exceptions
TikaExtractor.BoilerplateTabName=Boilerplate
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties
(original)
+++
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties
Wed May 10 13:27:33 2017
@@ -36,4 +36,11 @@ TikaExtractor.AddFieldMapping=Añadir
TikaExtractor.Delete=Borrar
TikaExtractor.DeleteFieldMapping=Eliminar asignación de campos
TikaExtractor.NoFieldNameSpecified=Por favor, especifique un nombre de campo
-TikaExtractor.IgnoreTikaExceptions=No haga caso de excepciones Tika:
\ No newline at end of file
+TikaExtractor.IgnoreTikaExceptions=No haga caso de excepciones Tika:
+TikaExtractor.TikaHostname=Nombre de host de Tika:
+TikaExtractor.TikaPort=Puerto de Tika:
+TikaExtractor.TikaRetry=Intervalo de reintento (ms):
+TikaExtractor.TikaParsersSelected=Analizadores de Tika
+TikaExtractor.TikaServerSelected=Servidor Tika
+TikaExtractor.TikaTypeTabName=Tipo de Tika
+TikaExtractor.TikaType=Tipo de Tika:
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
(original)
+++
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
Wed May 10 13:27:33 2017
@@ -13,6 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+TikaExtractor.TikaHostname=Tikaホスト名:
+TikaExtractor.TikaPort=Tikaポート:
+TikaExtractor.TikaRetry=再試行間隔 (ms):
+TikaExtractor.TikaParsersSelected=Tikaパーサー
+TikaExtractor.TikaServerSelected=Tikaサーバー
+TikaExtractor.TikaTypeTabName=Tikaタイプ
+TikaExtractor.TikaType=Tikaタイプ:
TikaExtractor.FieldMappingTabName=フィールドマッピング
TikaExtractor.ExceptionsTabName=例外
TikaExtractor.BoilerplateTabName=Boilerplate
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
---
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
(original)
+++
manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
Wed May 10 13:27:33 2017
@@ -13,6 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+TikaExtractor.TikaHostname=Tika主机名:
+TikaExtractor.TikaPort=Tika端口:
+TikaExtractor.TikaRetry=重试间隔 (ms):
+TikaExtractor.TikaParsersSelected=Tika解析器
+TikaExtractor.TikaServerSelected=Tika服务器
+TikaExtractor.TikaTypeTabName=Tika类型
+TikaExtractor.TikaType=Tika类型:
TikaExtractor.FieldMappingTabName=字段映射
TikaExtractor.ExceptionsTabName=异常
TikaExtractor.BoilerplateTabName=Boilerplate