Author: kwright
Date: Fri Aug 10 08:59:27 2018
New Revision: 1837783
URL: http://svn.apache.org/viewvc?rev=1837783&view=rev
Log:
Get rid of more tabs
Modified:
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
Modified:
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java?rev=1837783&r1=1837782&r2=1837783&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
(original)
+++
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
Fri Aug 10 08:59:27 2018
@@ -41,160 +41,160 @@ import java.util.regex.PatternSyntaxExce
public class HtmlExtractor extends
org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
{
- public static final String _rcsid = "@(#)$Id$";
+ public static final String _rcsid = "@(#)$Id$";
- protected static final String ACTIVITY_PROCESS = "process";
+ protected static final String ACTIVITY_PROCESS = "process";
- protected static final String[] activitiesList = new
String[]{ACTIVITY_PROCESS};
+ protected static final String[] activitiesList = new
String[]{ACTIVITY_PROCESS};
- /**
- * Forward to the javascript to check the specification parameters for
the job
- */
- private static final String EDIT_CONFIGURATION_JS =
"editConfiguration.js";
-
- private static final String VIEW_CONFIGURATION_HTML =
"viewConfiguration.html";
- private static final String EDIT_SPECIFICATION_JS =
"editSpecification.js";
- private static final String VIEW_SPECIFICATION_HTML =
"viewSpecification.html";
- private static final String EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML =
"editSpecification_HTML_Extractor.html";
-
-
-
- protected static final int HTML_STRIP_NONE = 0;
- protected static final int HTML_STRIP_ALL = 1;
-
- protected static int html_strip_usage = HTML_STRIP_ALL;
-
- public static final String NODE_KEEPMETADATA = "striphtml";
- public static final String NODE_FILTEREMPTY = "filterEmpty";
- public static final String ATTRIBUTE_SOURCE = "source";
- public static final String ATTRIBUTE_TARGET = "target";
- public static final String ATTRIBUTE_VALUE = "value";
-
- /** We handle up to 64K in memory; after that we go to disk. */
- protected static final long inMemoryMaximumFile = 65536;
-
- /** Return a list of activities that this connector generates.
- * The connector does NOT need to be connected before this method is
called.
- *@return the set of activities.
- */
- @Override
- public String[] getActivitiesList()
- {
- return activitiesList;
- }
-
- /** Add (or replace) a document in the output data store using the
connector.
- * This method presumes that the connector object has been configured,
and it is thus able to communicate with the output data store should that be
- * necessary.
- * The OutputSpecification is *not* provided to this method, because
the goal is consistency, and if output is done it must be consistent with the
- * output description, since that was what was partly used to determine
if output should be taking place. So it may be necessary for this method to
decode
- * an output description string in order to determine what should be
done.
- *@param documentURI is the URI of the document. The URI is presumed
to be the unique identifier which the output data store will use to process
- * and serve the document. This URI is constructed by the repository
connector which fetches the document, and is thus universal across all output
connectors.
- *@param outputDescription is the description string that was
constructed for this document by the getOutputDescription() method.
- *@param document is the document data to be processed (handed to the
output data store).
- *@param authorityNameString is the name of the authority responsible
for authorizing any access tokens passed in with the repository document. May
be null.
- *@param activities is the handle to an object that the implementer of
a pipeline connector may use to perform operations, such as logging processing
activity,
- * or sending a modified document to the next stage in the pipeline.
- *@return the document status (accepted or permanently rejected).
- *@throws IOException only if there's a stream error reading the
document data.
- */
- @Override
- public int addOrReplaceDocumentWithException(String documentURI,
VersionContext pipelineDescription, RepositoryDocument document, String
authorityNameString, IOutputAddActivity activities)
- throws ManifoldCFException, ServiceInterruption,
IOException
- {
- long startTime = System.currentTimeMillis();
- String resultCode = "OK";
- String description = null;
- Long length = null;
-
- final SpecPacker sp = new
SpecPacker(pipelineDescription.getSpecification());
-
-
- Logging.root.info("Processing by HTML Extractor");
- if (!(document.getMimeType().startsWith("text/html")) ||
(document.getMimeType().startsWith("application/xhtml+xml"))){
- Logging.root.warn("no processing, mime type not html");
- resultCode = "NO HTML";
-
- }
-
- else {
- try
- {
- Logging.root.info("Document recognized as HTML
- processing");
- long binaryLength = document.getBinaryLength();
-
-
- length = new Long(binaryLength);
-
- /*
- DestinationStorage ds;
- if (document.getBinaryLength() <=
inMemoryMaximumFile)
- {
- ds = new
MemoryDestinationStorage((int)document.getBinaryLength());
- }
- else
- {
- ds = new FileDestinationStorage();
- }
- try
- {
- OutputStream os = ds.getOutputStream();
- */
-
-
- //TODO
- /* Add an option to keep HTML markup of the
extracted text or not -
- * in case for example of processing by Tika
after this transformation connector
- *
- */
- Hashtable<String,String> metadataExtracted =
new Hashtable<String,String>();
-
- metadataExtracted =
JsoupProcessing.extractTextAndMetadataHtmlDocument(document.getBinaryStream(),sp.includeFilters.get(0),
sp.excludeFilters, sp.striphtml);
- InputStream newStream = new
ByteArrayInputStream(metadataExtracted.get("extractedDoc").getBytes(StandardCharsets.UTF_8));
- int lenghtNewStream = newStream.available();
- document.setBinary(newStream, lenghtNewStream);
- Iterator<Entry<String, String>> it;
- Map.Entry<String,String> entry;
-
- it = metadataExtracted.entrySet().iterator();
- while (it.hasNext()) {
- entry = it.next();
- if (entry.getKey()!="extractedDoc")
-
document.addField("jsoup_"+entry.getKey(), entry.getValue());
-
- }
-
- return
activities.sendDocument(documentURI,document);
- }
- catch (ServiceInterruption e)
- {
- resultCode = "SERVICEINTERRUPTION";
- description = e.getMessage();
- throw e;
- }
- catch (ManifoldCFException e)
- {
- resultCode = "EXCEPTION";
- description = e.getMessage();
- throw e;
- }
- catch (IOException e)
- {
- resultCode = "IOEXCEPTION";
- description = e.getMessage();
- throw e;
- }
-
- catch (Exception e)
- {
-
- resultCode =
e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
- description = e.getMessage();
- }
- finally
- {
- activities.recordActivity(new Long(startTime),
ACTIVITY_PROCESS, length, documentURI,
+ /**
+ * Forward to the javascript to check the specification parameters for the
job
+ */
+ private static final String EDIT_CONFIGURATION_JS = "editConfiguration.js";
+
+ private static final String VIEW_CONFIGURATION_HTML =
"viewConfiguration.html";
+ private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
+ private static final String VIEW_SPECIFICATION_HTML =
"viewSpecification.html";
+ private static final String EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML =
"editSpecification_HTML_Extractor.html";
+
+
+
+ protected static final int HTML_STRIP_NONE = 0;
+ protected static final int HTML_STRIP_ALL = 1;
+
+ protected static int html_strip_usage = HTML_STRIP_ALL;
+
+ public static final String NODE_KEEPMETADATA = "striphtml";
+ public static final String NODE_FILTEREMPTY = "filterEmpty";
+ public static final String ATTRIBUTE_SOURCE = "source";
+ public static final String ATTRIBUTE_TARGET = "target";
+ public static final String ATTRIBUTE_VALUE = "value";
+
+ /** We handle up to 64K in memory; after that we go to disk. */
+ protected static final long inMemoryMaximumFile = 65536;
+
+ /** Return a list of activities that this connector generates.
+ * The connector does NOT need to be connected before this method is called.
+ *@return the set of activities.
+ */
+ @Override
+ public String[] getActivitiesList()
+ {
+ return activitiesList;
+ }
+
+ /** Add (or replace) a document in the output data store using the connector.
+ * This method presumes that the connector object has been configured, and
it is thus able to communicate with the output data store should that be
+ * necessary.
+ * The OutputSpecification is *not* provided to this method, because the
goal is consistency, and if output is done it must be consistent with the
+ * output description, since that was what was partly used to determine if
output should be taking place. So it may be necessary for this method to decode
+ * an output description string in order to determine what should be done.
+ *@param documentURI is the URI of the document. The URI is presumed to be
the unique identifier which the output data store will use to process
+ * and serve the document. This URI is constructed by the repository
connector which fetches the document, and is thus universal across all output
connectors.
+ *@param outputDescription is the description string that was constructed
for this document by the getOutputDescription() method.
+ *@param document is the document data to be processed (handed to the output
data store).
+ *@param authorityNameString is the name of the authority responsible for
authorizing any access tokens passed in with the repository document. May be
null.
+ *@param activities is the handle to an object that the implementer of a
pipeline connector may use to perform operations, such as logging processing
activity,
+ * or sending a modified document to the next stage in the pipeline.
+ *@return the document status (accepted or permanently rejected).
+ *@throws IOException only if there's a stream error reading the document
data.
+ */
+ @Override
+ public int addOrReplaceDocumentWithException(String documentURI,
VersionContext pipelineDescription, RepositoryDocument document, String
authorityNameString, IOutputAddActivity activities)
+ throws ManifoldCFException, ServiceInterruption, IOException
+ {
+ long startTime = System.currentTimeMillis();
+ String resultCode = "OK";
+ String description = null;
+ Long length = null;
+
+ final SpecPacker sp = new
SpecPacker(pipelineDescription.getSpecification());
+
+
+ Logging.root.info("Processing by HTML Extractor");
+ if (!(document.getMimeType().startsWith("text/html")) ||
(document.getMimeType().startsWith("application/xhtml+xml"))){
+ Logging.root.warn("no processing, mime type not html");
+ resultCode = "NO HTML";
+
+ }
+
+ else {
+ try
+ {
+ Logging.root.info("Document recognized as HTML - processing");
+ long binaryLength = document.getBinaryLength();
+
+
+ length = new Long(binaryLength);
+
+ /*
+ DestinationStorage ds;
+ if (document.getBinaryLength() <= inMemoryMaximumFile)
+ {
+ ds = new MemoryDestinationStorage((int)document.getBinaryLength());
+ }
+ else
+ {
+ ds = new FileDestinationStorage();
+ }
+ try
+ {
+ OutputStream os = ds.getOutputStream();
+ */
+
+
+ //TODO
+ /* Add an option to keep HTML markup of the extracted text or not -
+ * in case for example of processing by Tika after this transformation
connector
+ *
+ */
+ Hashtable<String,String> metadataExtracted = new
Hashtable<String,String>();
+
+ metadataExtracted =
JsoupProcessing.extractTextAndMetadataHtmlDocument(document.getBinaryStream(),sp.includeFilters.get(0),
sp.excludeFilters, sp.striphtml);
+ InputStream newStream = new
ByteArrayInputStream(metadataExtracted.get("extractedDoc").getBytes(StandardCharsets.UTF_8));
+ int lenghtNewStream = newStream.available();
+ document.setBinary(newStream, lenghtNewStream);
+ Iterator<Entry<String, String>> it;
+ Map.Entry<String,String> entry;
+
+ it = metadataExtracted.entrySet().iterator();
+ while (it.hasNext()) {
+ entry = it.next();
+ if (entry.getKey()!="extractedDoc")
+ document.addField("jsoup_"+entry.getKey(), entry.getValue());
+
+ }
+
+ return activities.sendDocument(documentURI,document);
+ }
+ catch (ServiceInterruption e)
+ {
+ resultCode = "SERVICEINTERRUPTION";
+ description = e.getMessage();
+ throw e;
+ }
+ catch (ManifoldCFException e)
+ {
+ resultCode = "EXCEPTION";
+ description = e.getMessage();
+ throw e;
+ }
+ catch (IOException e)
+ {
+ resultCode = "IOEXCEPTION";
+ description = e.getMessage();
+ throw e;
+ }
+
+ catch (Exception e)
+ {
+
+ resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+ description = e.getMessage();
+ }
+ finally
+ {
+ activities.recordActivity(new Long(startTime), ACTIVITY_PROCESS,
length, documentURI,
resultCode, description);
}