Author: kwright
Date: Fri Mar 16 16:31:26 2018
New Revision: 1827009

URL: http://svn.apache.org/viewvc?rev=1827009&view=rev
Log:
Commit initial contribution (with path changes)

Added:
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/com/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/com/francelabs/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/com/francelabs/datafari/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/com/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/com/francelabs/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/com/francelabs/datafari/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/com/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/com/francelabs/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/com/francelabs/datafari/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html
    
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/build.xml
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,59 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Ant build for the HTML extractor connector. Shared targets come from the imported connector-build.xml. -->
+<project name="html" default="all">
+
+    <!-- Resolve mcf-dist: use the MCFDISTPATH environment variable when it is set, otherwise fall back to ../../dist. -->
+    <property environment="env"/>
+    <condition property="mcf-dist" value="${env.MCFDISTPATH}">
+        <isset property="env.MCFDISTPATH"/>
+    </condition>
+    <property name="abs-dist" location="../../dist"/>
+    <condition property="mcf-dist" value="${abs-dist}">
+        <not>
+            <isset property="env.MCFDISTPATH"/>
+        </not>
+    </condition>
+
+    <import file="${mcf-dist}/connector-build.xml"/>
+    
+    <!-- Compile classpath: the shared connector classpath plus the jsoup jar(s) found in ../../lib. -->
+    <path id="connector-classpath">
+        <path refid="mcf-connector-build.connector-classpath"/>
+        <fileset dir="../../lib">
+            <include name="jsoup*.jar"/>
+            
+        </fileset>
+    </path>
+
+    <!-- Extends the shared lib target: also copies the jsoup jar(s) into dist/lib. Runs only when canBuild is set. -->
+    <target name="lib" depends="mcf-connector-build.lib,precompile-check" 
if="canBuild">
+        <mkdir dir="dist/lib"/>
+        <copy todir="dist/lib">
+            <fileset dir="../../lib">
+                 <include name="jsoup*.jar"/>
+            </fileset>
+        </copy>
+    </target>
+
+    <!-- Extends the shared deliver-connector target by registering this connector as a transformation connector. -->
+    <target name="deliver-connector" 
depends="mcf-connector-build.deliver-connector">
+        <antcall target="general-add-transformation-connector">
+            <param name="connector-label" value="Html extractor"/>
+            <!-- NOTE(review): this class name uses package org.apache.manifoldcf.agents.transformers.htmlextractor,
+                 but HtmlExtractor.java in this commit is added under .../agents/transformation/htmlextractor/
+                 and declares package com.francelabs.datafari.htmlextractor. Confirm which fully-qualified
+                 name is intended; as written it matches neither. -->
+            <param name="connector-class" 
value="org.apache.manifoldcf.agents.transformers.htmlextractor.HtmlExtractor"/>
+        </antcall>
+    </target>
+
+</project>
+
+

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractor.java
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,753 @@
+package com.francelabs.datafari.htmlextractor;
+
+/* $Id$ */
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.system.Logging;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+
+import com.francelabs.datafari.htmlextractor.exception.RegexException;
+
+import org.apache.manifoldcf.agents.interfaces.*;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.Map.Entry;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/** This transformation connector uses Jsoup to extract text and metadata from
+ * HTML documents, then sends the modified document to the next pipeline stage.
+ */
+public class HtmlExtractor extends 
org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
+{
+
+       /** Revision-control id string. */
+       public static final String _rcsid = "@(#)$Id$";
+
+       /** Name of the single activity this connector records. */
+       protected static final String ACTIVITY_PROCESS = "process";
+
+       /** All activities this connector can record; returned by getActivitiesList(). */
+       protected static final String[] activitiesList = new 
String[]{ACTIVITY_PROCESS};
+
+       /**
+        * Resource name of the javascript emitted in the head section of the
+        * configuration page (see outputConfigurationHeader).
+        */
+       private static final String EDIT_CONFIGURATION_JS = 
"editConfiguration.js";
+
+       /** Resource name of the template rendered by viewConfiguration. */
+       private static final String VIEW_CONFIGURATION_HTML = 
"viewConfiguration.html";
+       // NOTE(review): the three specification resources below are presumably used by the
+       // specification UI methods further down in this class - confirm against those methods.
+       private static final String EDIT_SPECIFICATION_JS = 
"editSpecification.js";
+       private static final String VIEW_SPECIFICATION_HTML = 
"viewSpecification.html";
+       private static final String EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML = 
"editSpecification_HTML_Extractor.html";
+
+
+       /** We handle up to 64K in memory; after that we go to disk.
+        * NOTE(review): in the visible part of this class the constant is only referenced from
+        * commented-out code in addOrReplaceDocumentWithException - confirm it is still needed.
+        */
+       protected static final long inMemoryMaximumFile = 65536;
+
+       /** Return a list of activities that this connector generates.
+        * The connector does NOT need to be connected before this method is called.
+        *@return the set of activities; this is the shared static array, which contains only ACTIVITY_PROCESS.
+        */
+       @Override
+       public String[] getActivitiesList()
+       {
+               return activitiesList;
+       }
+
+       /** Extract text and metadata from an HTML document and send the result to the next pipeline stage.
+        * Documents whose mime type is missing, or is neither text/html nor application/xhtml+xml, are
+        * forwarded unchanged. For HTML documents the binary content is replaced by the "extractedDoc"
+        * text returned by JsoupProcessing, and every other returned entry is added to the document as
+        * a field whose name is prefixed with "jsoup_".
+        *@param documentURI is the URI of the document.  It is passed on unchanged to the next stage.
+        *@param pipelineDescription is the versioned pipeline specification from which the include and
+        * exclude filters are unpacked.
+        *@param document is the document data to be processed.
+        *@param authorityNameString is the name of the authority responsible for authorizing any access
+        * tokens passed in with the repository document.  May be null.  Not used by this connector.
+        *@param activities is the handle used to record processing activity and to send the (possibly
+        * modified) document to the next stage in the pipeline.
+        *@return the document status (accepted or permanently rejected) returned by the next stage.
+        *@throws IOException only if there's a stream error reading the document data.
+        */
+       @Override
+       public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
+                       throws ManifoldCFException, ServiceInterruption, IOException
+       {
+               final long startTime = System.currentTimeMillis();
+               String resultCode = "OK";
+               String description = null;
+               Long length = null;
+
+               final SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
+
+               Logging.root.info("Processing by HTML Extractor");
+
+               // A document may carry no mime type at all; treat that as "not HTML" instead of
+               // failing with a NullPointerException.
+               final String mimeType = document.getMimeType();
+               final boolean isHtml = mimeType != null
+                               && (mimeType.startsWith("text/html") || mimeType.startsWith("application/xhtml+xml"));
+               if (!isHtml)
+               {
+                       Logging.root.warn("no processing, mime type not html");
+                       return activities.sendDocument(documentURI,document);
+               }
+
+               try
+               {
+                       Logging.root.info("Document recognized as HTML - processing");
+                       length = Long.valueOf(document.getBinaryLength());
+
+                       //TODO
+                       /* Add an option to keep HTML markup of the extracted text or not -
+                        * in case for example of processing by Tika after this transformation connector
+                        */
+                       final Hashtable<String,String> metadataExtracted =
+                                       JsoupProcessing.extractTextAndMetadataHtmlDocument(document.getBinaryStream(),sp.includeFilters.get(0), sp.excludeFilters);
+
+                       // Replace the document body with the extracted text; the length is taken from
+                       // the encoded bytes themselves.
+                       final byte[] extractedBytes = metadataExtracted.get("extractedDoc").getBytes(StandardCharsets.UTF_8);
+                       document.setBinary(new ByteArrayInputStream(extractedBytes), extractedBytes.length);
+
+                       for (final Map.Entry<String,String> entry : metadataExtracted.entrySet())
+                       {
+                               // Compare by value: a reference comparison (!=) only works for interned strings.
+                               if (!"extractedDoc".equals(entry.getKey()))
+                               {
+                                       document.addField("jsoup_"+entry.getKey(), entry.getValue());
+                               }
+                       }
+
+                       return activities.sendDocument(documentURI,document);
+               }
+               catch (ServiceInterruption e)
+               {
+                       resultCode = "SERVICEINTERRUPTION";
+                       description = e.getMessage();
+                       throw e;
+               }
+               catch (ManifoldCFException e)
+               {
+                       resultCode = "EXCEPTION";
+                       description = e.getMessage();
+                       throw e;
+               }
+               catch (IOException e)
+               {
+                       resultCode = "IOEXCEPTION";
+                       description = e.getMessage();
+                       throw e;
+               }
+               catch (Exception e)
+               {
+                       // Any other failure (e.g. no include filter configured, or no "extractedDoc" entry)
+                       // is recorded and logged, and the document is still forwarded below.
+                       // NOTE(review): the binary stream may already have been partly read at this point -
+                       // confirm that forwarding the document in that state is acceptable.
+                       resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+                       description = e.getMessage();
+                       Logging.root.warn("HTML extraction failed for " + documentURI + "; forwarding document without extraction", e);
+               }
+               finally
+               {
+                       activities.recordActivity(Long.valueOf(startTime), ACTIVITY_PROCESS, length, documentURI,
+                                       resultCode, description);
+               }
+
+               return activities.sendDocument(documentURI,document);
+       }
+
+
+       /** Temporary holding area for output bytes: written through an output stream,
+        * then read back through an input stream.
+        */
+       protected static interface DestinationStorage
+       {
+               /** Get the output stream to write to.  Caller should explicitly close this stream when done writing.
+                */
+               public OutputStream getOutputStream()
+                               throws ManifoldCFException;
+
+               /** Get new binary length.
+                */
+               public long getBinaryLength()
+                               throws ManifoldCFException;
+
+               /** Get the input stream to read from.  Caller should explicitly close this stream when done reading.
+                */
+               public InputStream getInputStream()
+                               throws ManifoldCFException;
+
+               /** Close the object and clean up everything.
+                * This should be called when the data is no longer needed.
+                */
+               public void close()
+                               throws ManifoldCFException;
+       }
+
+       /** Destination storage backed by a temporary file on disk. */
+       protected static class FileDestinationStorage implements DestinationStorage
+       {
+               protected final File outputFile;
+               protected final OutputStream outputStream;
+
+               /** Create the temporary file and open an output stream onto it.
+                *@throws ManifoldCFException if the temporary file cannot be created or opened.
+                */
+               public FileDestinationStorage()
+                               throws ManifoldCFException
+               {
+                       File tempFile = null;
+                       OutputStream tempStream;
+                       try
+                       {
+                               tempFile = File.createTempFile("mcftika","tmp");
+                               tempStream = new FileOutputStream(tempFile);
+                       }
+                       catch (IOException e)
+                       {
+                               // Do not leave an orphaned temp file behind when opening the stream fails.
+                               if (tempFile != null)
+                                       tempFile.delete();
+                               // Fail here instead of continuing with null fields, which would only
+                               // surface later as a NullPointerException.
+                               throw wrapIOException(e);
+                       }
+                       this.outputFile = tempFile;
+                       this.outputStream = tempStream;
+               }
+
+               /** Get the output stream to write to.  Caller should explicitly close this stream when done writing.
+                */
+               @Override
+               public OutputStream getOutputStream()
+                               throws ManifoldCFException
+               {
+                       return outputStream;
+               }
+
+               /** Get new binary length.
+                *@return the current length of the temporary file, in bytes.
+                */
+               @Override
+               public long getBinaryLength()
+                               throws ManifoldCFException
+               {
+                       return outputFile.length();
+               }
+
+               /** Get the input stream to read from.  Caller should explicitly close this stream when done reading.
+                *@throws ManifoldCFException if the temporary file cannot be opened for reading.
+                */
+               @Override
+               public InputStream getInputStream()
+                               throws ManifoldCFException
+               {
+                       try
+                       {
+                               return new FileInputStream(outputFile);
+                       }
+                       catch (IOException e)
+                       {
+                               throw wrapIOException(e);
+                       }
+               }
+
+               /** Convert a local-storage IOException into a ManifoldCFException, in the same way as
+                * MemoryDestinationStorage.handleIOException: interruptions are flagged as INTERRUPTED.
+                */
+               private static ManifoldCFException wrapIOException(IOException e)
+               {
+                       if (e instanceof InterruptedIOException)
+                               return new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                       return new ManifoldCFException(e.getMessage(),e);
+               }
+
+               /** Close the object and clean up everything.
+                * This should be called when the data is no longer needed.
+                * Deletes the temporary file; the streams themselves are closed by their callers.
+                */
+               @Override
+               public void close()
+                               throws ManifoldCFException
+               {
+                       outputFile.delete();
+               }
+
+       }
+
+       /** Destination storage that keeps all written bytes in an in-memory buffer. */
+       protected static class MemoryDestinationStorage implements DestinationStorage
+       {
+               protected final ByteArrayOutputStream outputStream;
+
+               /** Create the in-memory buffer.
+                *@param sizeHint is the initial capacity of the buffer, in bytes.
+                */
+               public MemoryDestinationStorage(int sizeHint)
+               {
+                       outputStream = new ByteArrayOutputStream(sizeHint);
+               }
+
+               /** Get the output stream to write to.  Caller should explicitly close this stream when done writing.
+                */
+               @Override
+               public OutputStream getOutputStream()
+                               throws ManifoldCFException
+               {
+                       return outputStream;
+               }
+
+               /** Get new binary length.
+                *@return the number of bytes written to the buffer so far.
+                */
+               @Override
+               public long getBinaryLength()
+                               throws ManifoldCFException
+               {
+                       return outputStream.size();
+               }
+
+               /** Get the input stream to read from.  Caller should explicitly close this stream when done reading.
+                * The returned stream reads from a copy of the bytes written so far.
+                */
+               @Override
+               public InputStream getInputStream()
+                               throws ManifoldCFException
+               {
+                       return new ByteArrayInputStream(outputStream.toByteArray());
+               }
+
+               /** Close the object and clean up everything.
+                * This should be called when the data is no longer needed.
+                * Nothing to release for the in-memory implementation.
+                */
+               @Override
+               public void close()
+                               throws ManifoldCFException
+               {
+               }
+
+               /** Convert an IOException from local storage into a ManifoldCFException.
+                * This method always throws and never returns a value.
+                *@param e is the exception to convert.
+                *@throws ManifoldCFException always; flagged INTERRUPTED when e is an InterruptedIOException.
+                */
+               protected static int handleIOException(IOException e)
+                               throws ManifoldCFException
+               {
+                       // IOException reading from our local storage...
+                       if (e instanceof InterruptedIOException)
+                               throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                       throw new ManifoldCFException(e.getMessage(),e);
+               }
+
+       }
+       /**
+        * Find the first regular expression in a list that matches somewhere in
+        * the provided string.
+        *
+        * @param regexList
+        *          the regular expressions to try, in list order
+        * @param str
+        *          the string to test
+        * @return the first regex that matches, or null when none of them does
+        * @throws RegexException
+        *           if one of the expressions is not a valid pattern
+        */
+       private String matchingRegex(final List<String> regexList, final String str) throws RegexException {
+               final Iterator<String> candidates = regexList.iterator();
+               while (candidates.hasNext()) {
+                       final String candidate = candidates.next();
+                       boolean found;
+                       try {
+                               // find() looks for a match anywhere in the string, not a full match.
+                               found = Pattern.compile(candidate).matcher(str).find();
+                       } catch (final PatternSyntaxException syntaxError) {
+                               throw new RegexException(candidate, "Invalid regular expression");
+                       }
+                       if (found) {
+                               return candidate;
+                       }
+               }
+               return null;
+       }
+
+
+
+
+
+
+
+       /**
+        * Output the configuration header section. This method is called in the head
+        * section of the connector's configuration page. Its purpose is to add the
+        * required tabs to the list, and to output any javascript methods that might
+        * be needed by the configuration editing HTML.
+        * This implementation only emits the editConfiguration.js resource; it adds
+        * no tabs to tabsArray.
+        *
+        * @param threadContext
+        *          is the local thread context.
+        * @param out
+        *          is the output to which any HTML should be sent.
+        * @param locale
+        *          is the locale passed on to the resource renderer.
+        * @param parameters
+        *          are the configuration parameters, as they currently exist, for
+        *          this connection being configured.
+        * @param tabsArray
+        *          is an array of tab names. Add to this array any tab names that are
+        *          specific to the connector.
+        */
+       @Override
+       public void outputConfigurationHeader(final IThreadContext 
threadContext, final IHTTPOutput out, final Locale locale,
+                       final ConfigParams parameters, final List<String> 
tabsArray) throws ManifoldCFException, IOException {
+
+               // No velocity context is supplied (null) for the script resource.
+               Messages.outputResourceWithVelocity(out, locale, 
EDIT_CONFIGURATION_JS, null);
+       }
+
+       /**
+        * Output the configuration body section. This method is called in the body
+        * section of the connector's configuration page. Its purpose is to present
+        * the required form elements for editing. The coder can presume that the HTML
+        * that is output from this configuration will be within appropriate <html>,
+        * <body>, and <form> tags. The name of the form is "editconnection".
+        * This implementation outputs nothing.
+        *
+        * @param threadContext
+        *          is the local thread context.
+        * @param out
+        *          is the output to which any HTML should be sent.
+        * @param locale
+        *          is the locale of the page being rendered.
+        * @param parameters
+        *          are the configuration parameters, as they currently exist, for
+        *          this connection being configured.
+        * @param tabName
+        *          is the current tab name.
+        */
+       @Override
+       public void outputConfigurationBody(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale,
+                       final ConfigParams parameters, final String tabName) throws ManifoldCFException, IOException {
+               // Intentionally empty: the previous body built a velocity context that was never
+               // rendered, so no form elements were ever written to the page.
+       }
+
+       /**
+        * Process a configuration post. This method is called at the start of the
+        * connector's configuration page, whenever there is a possibility that form
+        * data for a connection has been posted. Its purpose is to gather form
+        * information and modify the configuration parameters accordingly. The name
+        * of the posted form is "editconnection".
+        * This implementation reads nothing from the post and leaves the parameters
+        * untouched.
+        *
+        * @param threadContext
+        *          is the local thread context.
+        * @param variableContext
+        *          is the set of variables available from the post, including binary
+        *          file post information.
+        * @param locale
+        *          is the locale of the page being processed.
+        * @param parameters
+        *          are the configuration parameters, as they currently exist, for
+        *          this connection being configured.
+        * @return null if all is well, or a string error message if there is an error
+        *         that should prevent saving of the connection (and cause a
+        *         redirection to an error page). This implementation always returns null.
+        */
+       @Override
+       public String processConfigurationPost(final IThreadContext 
threadContext, final IPostParameters variableContext,
+                       final Locale locale, final ConfigParams parameters) 
throws ManifoldCFException {
+
+
+               return null;
+       }
+
+       /**
+        * Render the read-only view of the connection configuration. The output is
+        * produced from the VIEW_CONFIGURATION_HTML template and is expected to be
+        * placed inside html and body tags supplied by the caller.
+        *
+        * @param threadContext
+        *          is the local thread context.
+        * @param out
+        *          is the output to which any HTML should be sent.
+        * @param locale
+        *          is the locale used to select the template.
+        * @param parameters
+        *          are the configuration parameters, as they currently exist, for
+        *          this connection being configured.
+        */
+       @Override
+       public void viewConfiguration(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale,
+                       final ConfigParams parameters) throws ManifoldCFException, IOException {
+               // The view template is rendered against an empty context: no value is
+               // taken from the configuration parameters.
+               Messages.outputResourceWithVelocity(out, locale, VIEW_CONFIGURATION_HTML, new HashMap<String, Object>());
+       }
+
+       /**
+        * Collect the include and exclude filter expressions stored in a
+        * specification and publish them in the given map under the keys
+        * "INCLUDEFILTERS" and "EXCLUDEFILTERS" (each a list of strings, in
+        * specification order). Nodes of other types, and filter nodes without a
+        * regex attribute, are ignored.
+        *
+        * @param paramMap
+        *          is the template context to fill in.
+        * @param os
+        *          is the specification to read the filter nodes from.
+        */
+       protected static void fillInHtmlExtractorSpecification(final Map<String, Object> paramMap, final Specification os) {
+               final List<String> included = new ArrayList<String>();
+               final List<String> excluded = new ArrayList<String>();
+
+               int index = 0;
+               while (index < os.getChildCount()) {
+                       final SpecificationNode child = os.getChild(index++);
+                       final String nodeType = child.getType();
+
+                       // Pick the list this node feeds; anything else is skipped.
+                       final List<String> target;
+                       if (nodeType.equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
+                               target = included;
+                       } else if (nodeType.equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
+                               target = excluded;
+                       } else {
+                               continue;
+                       }
+
+                       final String expression = child.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
+                       if (expression != null) {
+                               target.add(expression);
+                       }
+               }
+
+               paramMap.put("INCLUDEFILTERS", included);
+               paramMap.put("EXCLUDEFILTERS", excluded);
+       }
+
+       /**
+        * Emit the head section contribution for a job page that uses this
+        * connector: registers the connector's tab name and renders the
+        * EDIT_SPECIFICATION_JS template.
+        *
+        * @param out
+        *          is the output to which any HTML should be sent.
+        * @param locale
+        *          is the locale used for the tab name and the template.
+        * @param os
+        *          is the current pipeline specification for this connection.
+        * @param connectionSequenceNumber
+        *          is the unique number of this connection within the job.
+        * @param tabsArray
+        *          is an array of tab names. The connector's own tab name is
+        *          appended to it.
+        */
+       @Override
+       public void outputSpecificationHeader(final IHTTPOutput out, final Locale locale, final Specification os,
+                       final int connectionSequenceNumber, final List<String> tabsArray) throws ManifoldCFException, IOException {
+               final Map<String, Object> context = new HashMap<>();
+               context.put("SEQNUM", String.valueOf(connectionSequenceNumber));
+
+               // Register the tab owned by this connector.
+               final String tabTitle = Messages.getString(locale,
+                               "DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName");
+               tabsArray.add(tabTitle);
+
+               // The template receives the include/exclude filter lists as well.
+               fillInHtmlExtractorSpecification(context, os);
+
+               Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, context);
+       }
+
+       /**
+        * Emit the body section contribution for a job page that uses this
+        * connector by rendering the EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML template.
+        * The output is expected to sit inside html, body and form tags supplied by
+        * the caller; the form is named "editjob".
+        *
+        * @param out
+        *          is the output to which any HTML should be sent.
+        * @param locale
+        *          is the preferred locale of the output.
+        * @param os
+        *          is the current pipeline specification for this job.
+        * @param connectionSequenceNumber
+        *          is the unique number of this connection within the job.
+        * @param actualSequenceNumber
+        *          is the connection within the job that has currently been selected.
+        * @param tabName
+        *          is the current tab name.
+        */
+       @Override
+       public void outputSpecificationBody(final IHTTPOutput out, final Locale locale, final Specification os,
+                       final int connectionSequenceNumber, final int actualSequenceNumber, final String tabName)
+                       throws ManifoldCFException, IOException {
+               final Map<String, Object> context = new HashMap<>();
+
+               // Include/exclude filter lists currently stored in the specification.
+               fillInHtmlExtractorSpecification(context, os);
+
+               // Values identifying the selected tab and this connection within the job.
+               context.put("SELECTEDNUM", String.valueOf(actualSequenceNumber));
+               context.put("SEQNUM", String.valueOf(connectionSequenceNumber));
+               context.put("TABNAME", tabName);
+
+               Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML, context);
+       }
+
+       /**
+        * Process a specification post. This method is called at the start of a job's
+        * edit or view page, whenever there is a possibility that form data for a
+        * connection has been posted. It rebuilds the include filter and exclude
+        * filter nodes of the specification from the posted form ("editjob").
+        *
+        * For each filter kind the form is expected to post, under the prefix
+        * "s" + connectionSequenceNumber + "_" + kind + "_": a "count" field, per-row
+        * "op_N" and "regex_N" fields, and optional "op" / "regex" fields for a row
+        * being added. A kind whose "count" field is absent or empty is left
+        * untouched.
+        *
+        * @param variableContext
+        *          contains the post data, including binary file-upload information.
+        * @param locale
+        *          is the preferred locale of the output.
+        * @param os
+        *          is the current pipeline specification for this job.
+        * @param connectionSequenceNumber
+        *          is the unique number of this connection within the job.
+        * @return null if all is well, or a string error message if there is an error
+        *         that should prevent saving of the job (for example a row count that
+        *         is not a number).
+        */
+       @Override
+       public String processSpecificationPost(final IPostParameters variableContext, final Locale locale,
+                       final Specification os, final int connectionSequenceNumber) throws ManifoldCFException {
+
+               final String seqPrefix = "s" + connectionSequenceNumber + "_";
+
+               // Include filters
+               final String includeError = gatherFilterNodes(variableContext, os, seqPrefix + "includefilter_",
+                               HtmlExtractorConfig.NODE_INCLUDEFILTER);
+               if (includeError != null) {
+                       return includeError;
+               }
+
+               // Exclude filters
+               return gatherFilterNodes(variableContext, os, seqPrefix + "excludefilter_",
+                               HtmlExtractorConfig.NODE_EXCLUDEFILTER);
+       }
+
+       /**
+        * Replace all nodes of one filter kind with the rows posted under the given
+        * form-field prefix.
+        *
+        * @return null on success (or when nothing was posted for this kind), or an
+        *         error message when the posted row count is not a valid integer.
+        */
+       private static String gatherFilterNodes(final IPostParameters variableContext, final Specification os,
+                       final String prefix, final String nodeType) throws ManifoldCFException {
+               final String countValue = variableContext.getParameter(prefix + "count");
+               if (countValue == null || countValue.length() == 0) {
+                       // Nothing posted for this filter kind: keep the existing nodes.
+                       return null;
+               }
+
+               // Validate the count BEFORE touching the specification, so that a
+               // malformed post cannot wipe the existing filters.
+               final int count;
+               try {
+                       count = Integer.parseInt(countValue);
+               } catch (final NumberFormatException e) {
+                       return "Invalid value for '" + prefix + "count': " + countValue;
+               }
+
+               // About to gather the posted nodes, so get rid of the old ones. The index
+               // only advances when nothing was removed at the current position.
+               int i = 0;
+               while (i < os.getChildCount()) {
+                       if (os.getChild(i).getType().equals(nodeType)) {
+                               os.removeChild(i);
+                       } else {
+                               i++;
+                       }
+               }
+
+               // Existing rows: keep every row that is not flagged for deletion.
+               for (int row = 0; row < count; row++) {
+                       final String suffix = "_" + Integer.toString(row);
+                       final String op = variableContext.getParameter(prefix + "op" + suffix);
+                       if (!"Delete".equals(op)) {
+                               addFilterNode(os, nodeType,
+                                               variableContext.getParameter(prefix + HtmlExtractorConfig.ATTRIBUTE_REGEX + suffix));
+                       }
+               }
+
+               // Row being added through the "Add" button, if any.
+               if ("Add".equals(variableContext.getParameter(prefix + "op"))) {
+                       addFilterNode(os, nodeType, variableContext.getParameter(prefix + "regex"));
+               }
+
+               return null;
+       }
+
+       /**
+        * Append a filter node of the given type carrying the given expression. A
+        * missing (null) expression is skipped: a filter node without a regex
+        * attribute is ignored by fillInHtmlExtractorSpecification anyway.
+        */
+       private static void addFilterNode(final Specification os, final String nodeType, final String regex)
+                       throws ManifoldCFException {
+               if (regex == null) {
+                       return;
+               }
+               final SpecificationNode node = new SpecificationNode(nodeType);
+               node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
+               os.addChild(os.getChildCount(), node);
+       }
+
+       /**
+        * Render the read-only view of this connector's part of a job's pipeline
+        * specification, using the VIEW_SPECIFICATION_HTML template. The output is
+        * expected to sit inside html and body tags supplied by the caller.
+        *
+        * @param out
+        *          is the output to which any HTML should be sent.
+        * @param locale
+        *          is the preferred locale of the output.
+        * @param os
+        *          is the current pipeline specification for this job.
+        * @param connectionSequenceNumber
+        *          is the unique number of this connection within the job.
+        */
+       @Override
+       public void viewSpecification(final IHTTPOutput out, final Locale locale, final Specification os,
+                       final int connectionSequenceNumber) throws ManifoldCFException, IOException {
+               final Map<String, Object> context = new HashMap<>();
+
+               // Filter lists from the specification, plus the connection's sequence number.
+               fillInHtmlExtractorSpecification(context, os);
+               context.put("SEQNUM", String.valueOf(connectionSequenceNumber));
+
+               Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, context);
+       }
+       /**
+        * Snapshot of the include and exclude filters of a specification, able to
+        * serialise itself into a single packed string.
+        */
+       protected static class SpecPacker {
+
+               private final List<String> includeFilters = new ArrayList<>();
+               private final List<String> excludeFilters = new ArrayList<>();
+
+               /**
+                * Read the filter nodes of the given specification. Nodes without a regex
+                * attribute are skipped, consistent with fillInHtmlExtractorSpecification,
+                * so that no null entry is ever handed to packList. When no include filter
+                * is configured, HtmlExtractorConfig.WHITELIST_DEFAULT is used.
+                *
+                * @param os
+                *          is the specification to read.
+                */
+               public SpecPacker(final Specification os) {
+                       for (int i = 0; i < os.getChildCount(); i++) {
+                               final SpecificationNode sn = os.getChild(i);
+                               final String type = sn.getType();
+
+                               // A node is of exactly one type, so the checks are mutually exclusive.
+                               final List<String> target;
+                               if (type.equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
+                                       target = includeFilters;
+                               } else if (type.equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
+                                       target = excludeFilters;
+                               } else {
+                                       continue;
+                               }
+
+                               final String regex = sn.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
+                               if (regex != null) {
+                                       target.add(regex);
+                               }
+                       }
+
+                       if (includeFilters.isEmpty()) {
+                               includeFilters.add(HtmlExtractorConfig.WHITELIST_DEFAULT);
+                       }
+               }
+
+               /**
+                * @return the include filters followed by the exclude filters, each packed
+                *         with packList using '+' as the delimiter. NOTE(review): packList
+                *         is defined outside this class; its exact encoding is not visible
+                *         here.
+                */
+               public String toPackedString() {
+                       final StringBuilder sb = new StringBuilder();
+
+                       packList(sb, includeFilters, '+');
+                       packList(sb, excludeFilters, '+');
+
+                       return sb.toString();
+               }
+
+       }
+}
+
+

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/HtmlExtractorConfig.java
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,41 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.francelabs.datafari.htmlextractor;
+
+/**
+ * Constants shared by the HTML extractor transformation connector: default
+ * values and the node/attribute names used in the job specification.
+ */
+public class HtmlExtractorConfig {
+
+  // Configuration parameters
+       //TODO : remove the Solr parameters
+  public static final String PARAM_SOLRUPDATEHANDLER = "solrUpdateHandler";
+  public static final String SOLRUPDATEHANDLER_DEFAULT = "/update/no-tika";
+  // Include filter used when a specification defines none (see SpecPacker).
+  public static final String WHITELIST_DEFAULT = "body";
+  // NOTE(review): presumably the default exclude filter (none); no use visible here.
+  public static final String BLACKLIST_DEFAULT = "";
+  
+  // Specification nodes and values
+  // Node types holding one include / exclude filter each.
+  public static final String NODE_INCLUDEFILTER = "includefilter";
+  public static final String NODE_EXCLUDEFILTER = "excludefilter";
+  // NOTE(review): same value as WHITELIST_DEFAULT; no use visible here.
+  public static final String INCLUDEFILTER_DEFAULT = "body";
+  // Attribute of a filter node that carries the filter expression.
+  public static final String ATTRIBUTE_REGEX = "regex";
+  public static final String ATTRIBUTE_VALUE = "value";
+
+}

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,176 @@
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.francelabs.datafari.htmlextractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.List;
+
+import org.apache.manifoldcf.core.system.Logging;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+/**
+ * Jsoup-based helper that extracts metadata and a filtered HTML fragment from
+ * an HTML document.
+ */
+public class JsoupProcessing {
+
+       /** Result key under which the filtered HTML fragment is stored. */
+       private static final String EXTRACTED_DOC_KEY = "extractedDoc";
+
+       /** Selector meaning "keep the whole body"; also used when no whitelist is given. */
+       private static final String DEFAULT_WHITELIST = "body";
+
+       /**
+        * Meta tags that are looked up individually: {meta name, result key}. For
+        * these, the FIRST matching meta element wins.
+        */
+       private static final String[][] NAMED_META = {
+               {"keywords", "keywords"},
+               {"description", "description"},
+               {"author", "author"},
+               {"dcterms.subject", "dc_terms_subject"},
+               {"dcterms.title", "dc_terms_title"},
+               {"dcterms.creator", "dc_terms_creator"},
+               {"dcterms.description", "dc_terms_description"},
+               {"dcterms.publisher", "dc_terms_publisher"},
+               {"dcterms.contributor", "dc_terms_contributor"},
+               {"dcterms.date", "dc_terms_date"},
+               {"dcterms.type", "dc_terms_type"},
+               {"dcterms.format", "dc_terms_format"},
+               {"dcterms.language", "dc_terms_language"},
+               {"dcterms.identifier", "dc_terms_identifier"},
+       };
+
+       /**
+        * Parse an HTML document and return its metadata together with the HTML of
+        * the part of the document that should be kept.
+        *
+        * The returned table contains: one entry per named meta tag (name to
+        * content), "title" (text of the title element, possibly empty), the
+        * entries listed in NAMED_META when present, and "extractedDoc" holding the
+        * inner HTML of the first element matching the whitelist after every
+        * element matching a blacklist selector has been removed from it.
+        *
+        * @param streamDoc
+        *          is the HTML document; it is decoded as UTF-8.
+        * @param whitelist
+        *          is the CSS selector of the element to keep; null, empty or
+        *          "body" keeps the whole body.
+        * @param blacklist
+        *          is the list of CSS selectors of elements to drop from the kept
+        *          part; may be null. Null or blank entries are ignored.
+        * @return the metadata table. "extractedDoc" is the empty string when the
+        *         whitelist selector matches nothing.
+        * @throws IOException
+        *           if the stream cannot be read.
+        */
+       public static Hashtable<String,String> extractTextAndMetadataHtmlDocument(InputStream streamDoc,String whitelist,List<String> blacklist) throws IOException{
+               final Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
+               final Hashtable<String,String> metadata = new Hashtable<String,String>();
+
+               // Generic pass: every named meta tag. Tags without a name (charset,
+               // http-equiv, property...) are skipped instead of all landing on key "".
+               for (final Element meta : doc.select("meta")) {
+                       final String name = meta.attr("name");
+                       if (!name.isEmpty()) {
+                               metadata.put(name, meta.attr("content"));
+                       }
+               }
+
+               // select() never returns null; text() is empty when there is no title.
+               metadata.put("title", doc.select("title").text());
+
+               for (final String[] mapping : NAMED_META) {
+                       final Element element = doc.select("meta[name=\"" + mapping[0] + "\"]").first();
+                       if (element != null) {
+                               metadata.put(mapping[1], element.attr("content"));
+                       }
+               }
+
+               // Enclosing element: compare by value, and only run a selector when one
+               // was really supplied (jsoup rejects null/empty queries).
+               Element docToKeep = doc.body();
+               if (whitelist != null && !whitelist.trim().isEmpty() && !DEFAULT_WHITELIST.equals(whitelist)) {
+                       docToKeep = doc.select(whitelist).first();
+               }
+
+               if (docToKeep == null) {
+                       // The whitelist matched nothing: there is no content to keep.
+                       metadata.put(EXTRACTED_DOC_KEY, "");
+                       return metadata;
+               }
+
+               // Blacklist
+               if (blacklist != null) {
+                       for (final String selector : blacklist) {
+                               if (selector != null && !selector.trim().isEmpty()) {
+                                       docToKeep.select(selector).remove();
+                               }
+                       }
+               }
+
+               metadata.put(EXTRACTED_DOC_KEY, docToKeep.html());
+
+               return metadata;
+       }
+
+}
+
+

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/Messages.java
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,125 @@
+/* $Id: Messages.java 1596720 2014-05-22 00:57:29Z kwright $ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package com.francelabs.datafari.htmlextractor;
+
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.manifoldcf.core.interfaces.IHTTPOutput;
+import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
+
+/**
+ * Localisation helper for the HTML extractor connector. Every method is a thin
+ * static wrapper that forwards to the method of the same name in
+ * org.apache.manifoldcf.ui.i18n.Messages, supplying this class, the connector's
+ * resource bundle name and/or its resource path.
+ */
+public class Messages extends org.apache.manifoldcf.ui.i18n.Messages {
+  // Resource bundle used for message lookups.
+  public static final String DEFAULT_BUNDLE_NAME = 
"com.francelabs.datafari.htmlextractor.common";
+  // Resource path used when rendering template resources.
+  public static final String DEFAULT_PATH_NAME = 
"com.francelabs.datafari.htmlextractor";
+
+  /**
+   * Constructor - do not instantiate
+   */
+  protected Messages() {
+  }
+
+  // Lookups in the default bundle, without message arguments.
+
+  public static String getString(final Locale locale, final String messageKey) 
{
+    return getString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+  }
+
+  public static String getAttributeString(final Locale locale, final String 
messageKey) {
+    return getAttributeString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+  }
+
+  public static String getBodyString(final Locale locale, final String 
messageKey) {
+    return getBodyString(DEFAULT_BUNDLE_NAME, locale, messageKey, null);
+  }
+
+  public static String getAttributeJavascriptString(final Locale locale, final 
String messageKey) {
+    return getAttributeJavascriptString(DEFAULT_BUNDLE_NAME, locale, 
messageKey, null);
+  }
+
+  public static String getBodyJavascriptString(final Locale locale, final 
String messageKey) {
+    return getBodyJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, 
null);
+  }
+
+  // Lookups in the default bundle, with message arguments.
+
+  public static String getString(final Locale locale, final String messageKey, 
final Object[] args) {
+    return getString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+  }
+
+  public static String getAttributeString(final Locale locale, final String 
messageKey, final Object[] args) {
+    return getAttributeString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+  }
+
+  public static String getBodyString(final Locale locale, final String 
messageKey, final Object[] args) {
+    return getBodyString(DEFAULT_BUNDLE_NAME, locale, messageKey, args);
+  }
+
+  public static String getAttributeJavascriptString(final Locale locale, final 
String messageKey, final Object[] args) {
+    return getAttributeJavascriptString(DEFAULT_BUNDLE_NAME, locale, 
messageKey, args);
+  }
+
+  public static String getBodyJavascriptString(final Locale locale, final 
String messageKey, final Object[] args) {
+    return getBodyJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, 
args);
+  }
+
+  // More general methods which allow bundlenames and class loaders to be
+  // specified. These pass Messages.class on to the superclass methods.
+
+  public static String getString(final String bundleName, final Locale locale, 
final String messageKey,
+      final Object[] args) {
+    return getString(Messages.class, bundleName, locale, messageKey, args);
+  }
+
+  public static String getAttributeString(final String bundleName, final 
Locale locale, final String messageKey,
+      final Object[] args) {
+    return getAttributeString(Messages.class, bundleName, locale, messageKey, 
args);
+  }
+
+  public static String getBodyString(final String bundleName, final Locale 
locale, final String messageKey,
+      final Object[] args) {
+    return getBodyString(Messages.class, bundleName, locale, messageKey, args);
+  }
+
+  public static String getAttributeJavascriptString(final String bundleName, 
final Locale locale,
+      final String messageKey, final Object[] args) {
+    return getAttributeJavascriptString(Messages.class, bundleName, locale, 
messageKey, args);
+  }
+
+  public static String getBodyJavascriptString(final String bundleName, final 
Locale locale, final String messageKey,
+      final Object[] args) {
+    return getBodyJavascriptString(Messages.class, bundleName, locale, 
messageKey, args);
+  }
+
+  // Resource output
+
+  // Plain substitution of string parameters into the resource found under
+  // DEFAULT_PATH_NAME.
+  public static void outputResource(final IHTTPOutput output, final Locale 
locale, final String resourceKey,
+      final Map<String, String> substitutionParameters, final boolean 
mapToUpperCase) throws ManifoldCFException {
+    outputResource(output, Messages.class, DEFAULT_PATH_NAME, locale, 
resourceKey, substitutionParameters,
+        mapToUpperCase);
+  }
+
+  // Velocity rendering with string substitution parameters.
+  public static void outputResourceWithVelocity(final IHTTPOutput output, 
final Locale locale, final String resourceKey,
+      final Map<String, String> substitutionParameters, final boolean 
mapToUpperCase) throws ManifoldCFException {
+    outputResourceWithVelocity(output, Messages.class, DEFAULT_BUNDLE_NAME, 
DEFAULT_PATH_NAME, locale, resourceKey,
+        substitutionParameters, mapToUpperCase);
+  }
+
+  // Velocity rendering with arbitrary context objects.
+  public static void outputResourceWithVelocity(final IHTTPOutput output, 
final Locale locale, final String resourceKey,
+      final Map<String, Object> contextObjects) throws ManifoldCFException {
+    outputResourceWithVelocity(output, Messages.class, DEFAULT_BUNDLE_NAME, 
DEFAULT_PATH_NAME, locale, resourceKey,
+        contextObjects);
+  }
+
+}

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/HtmlExtractorException.java
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,19 @@
+package com.francelabs.datafari.htmlextractor.exception;
+
+
+/**
+ * Checked exception of the HTML extractor; carries a message and, optionally, the
+ * underlying cause.
+ */
+public class HtmlExtractorException extends Exception {
+
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * Creates an exception without an underlying cause.
+   *
+   * @param message description of the failure
+   */
+  public HtmlExtractorException(final String message) {
+    super(message);
+  }
+
+  /**
+   * Creates an exception that wraps the problem which triggered it.
+   *
+   * @param message description of the failure
+   * @param cause the underlying problem, may be {@code null}; typed as {@link Throwable}
+   *        so that errors can be chained as well as exceptions
+   */
+  public HtmlExtractorException(final String message, final Throwable cause) {
+    super(message, cause);
+  }
+
+}

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/exception/RegexException.java
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,27 @@
+package com.francelabs.datafari.htmlextractor.exception;
+
+
+/**
+ * Checked exception that records the expression it was raised for, in addition to the
+ * usual message and optional cause.
+ */
+public class RegexException extends Exception {
+
+  private static final long serialVersionUID = 1L;
+
+  /** The expression supplied at construction time; never reassigned. */
+  private final String regex;
+
+  /**
+   * Creates an exception without an underlying cause.
+   *
+   * @param regex the expression this exception refers to
+   * @param message description of the failure
+   */
+  public RegexException(final String regex, final String message) {
+    super(message);
+    this.regex = regex;
+  }
+
+  /**
+   * Creates an exception that wraps the problem which triggered it.
+   *
+   * @param regex the expression this exception refers to
+   * @param message description of the failure
+   * @param cause the underlying problem, may be {@code null}; typed as {@link Throwable}
+   *        so that errors can be chained as well as exceptions
+   */
+  public RegexException(final String regex, final String message, final Throwable cause) {
+    super(message, cause);
+    this.regex = regex;
+  }
+
+  /**
+   * Returns the expression given to the constructor.
+   *
+   * @return the expression exactly as it was passed in
+   */
+  public String getRegex() {
+    return regex;
+  }
+
+}

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_en_US.properties
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML 
Extractor
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete 
englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete 
blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No 
englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No 
blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector 
specified

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_es_ES.properties
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML 
Extractor
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete 
englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete 
blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No 
englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No 
blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector 
specified

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_ja_JP.properties
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML 
Extractor
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete 
englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete 
blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No 
englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No 
blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector 
specified

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/htmlextractor/common_zh_CN.properties
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName=HTML 
Extractor
+DatafariHtmlExtractorTransformationConnector.SolrUpdateHandler=Update handler:
+DatafariHtmlExtractorTransformationConnector.IncludeFilters=Englobing tag :
+DatafariHtmlExtractorTransformationConnector.ExcludeFilters=Tags to remove :
+DatafariHtmlExtractorTransformationConnector.RegularExpression=CSS selector
+DatafariHtmlExtractorTransformationConnector.Delete=Delete
+DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter=Delete 
englobing tag
+DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter=Delete 
blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified=No 
englobing tag specified
+DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified=No 
blacklist tag specified
+DatafariHtmlExtractorTransformationConnector.Add=Add
+DatafariHtmlExtractorTransformationConnector.AddIncludeFilter=Add englobing tag
+DatafariHtmlExtractorTransformationConnector.AddExcludeFilter=Add blacklist tag
+DatafariHtmlExtractorTransformationConnector.NoRegexSpecified=No CSS selector 
specified

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editConfiguration.js
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,27 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<script type="text/javascript">
+<!--
+
+// Validation hook for the connection configuration form.
+// Always returns true: this script performs no client-side checks.
+// NOTE(review): presumably called by the ManifoldCF UI before the form is submitted; confirm.
+function checkConfig()
+{
+  return true;
+}
+
+//-->
+</script>

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification.js
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,76 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<script type="text/javascript">
+<!--
+// Validation hook for this pipeline stage's specification form.
+// Always returns true: this script performs no client-side checks.
+// NOTE(review): presumably called by the ManifoldCF UI before the job form is submitted; confirm.
+function s${SEQNUM}_checkSpecification()
+{
+  return true;
+}
+
+function s${SEQNUM}_addIncludeFilter()
+{
+  if (editjob.s${SEQNUM}_includefilter_regex.value == "")
+  {
+    
alert("$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoRegexSpecified'))");
+    editjob.s${SEQNUM}_includefilter_regex.focus();
+    return;
+  }
+  editjob.s${SEQNUM}_includefilter_op.value="Add";
+  postFormSetAnchor("s${SEQNUM}_includefilter");
+}
+
+function s${SEQNUM}_addExcludeFilter()
+{
+  if (editjob.s${SEQNUM}_excludefilter_regex.value == "")
+  {
+    
alert("$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoRegexSpecified'))");
+    editjob.s${SEQNUM}_excludefilter_regex.focus();
+    return;
+  }
+  editjob.s${SEQNUM}_excludefilter_op.value="Add";
+  postFormSetAnchor("s${SEQNUM}_excludefilter");
+}
+
+function s${SEQNUM}_deleteIncludeFilter(i)
+{
+  // Set the operation
+  eval("editjob.s${SEQNUM}_includefilter_op_"+i+".value=\"Delete\"");
+  // Submit
+  if (editjob.s${SEQNUM}_includefilter_count.value==i)
+    postFormSetAnchor("s${SEQNUM}_includefilter");
+  else
+    postFormSetAnchor("s${SEQNUM}_includefilter_"+i)
+  // Undo, so we won't get two deletes next time
+  eval("editjob.s${SEQNUM}_includefilter_op_"+i+".value=\"Continue\"");
+}
+
+function s${SEQNUM}_deleteExcludeFilter(i)
+{
+  // Set the operation
+  eval("editjob.s${SEQNUM}_excludefilter_op_"+i+".value=\"Delete\"");
+  // Submit
+  if (editjob.s${SEQNUM}_excludefilter_count.value==i)
+    postFormSetAnchor("s${SEQNUM}_excludefilter");
+  else
+    postFormSetAnchor("s${SEQNUM}_excludefilter_"+i)
+  // Undo, so we won't get two deletes next time
+  eval("editjob.s${SEQNUM}_excludefilter_op_"+i+".value=\"Continue\"");
+}
+
+//-->
+</script>

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/editSpecification_HTML_Extractor.html
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,148 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+#if($TABNAME == 
$ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName')
 && ${SEQNUM} == ${SELECTEDNUM})
+
+<table class="displaytable">
+<tr>
+    <td class="description">
+      
<nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.IncludeFilters'))</nobr>
+    </td>
+    <td class="boxcell">
+      <table class="formtable">
+        <tr class="formheaderrow">
+          <td class="formcolumnheader"></td>
+          <td 
class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+        </tr>
+
+  ## One table row per configured include filter: a Delete button, the per-row
+  ## operation field, the filter value as a hidden field, and the value as text.
+  ## NOTE(review): only the first two entries are rendered and counted; confirm
+  ## that this limit is intended.
+  #set($includecounter = 0)
+  #foreach($includefilter in $INCLUDEFILTERS)
+  #if(($includecounter) < 2)
+    #set($includecounterdisplay = $includecounter + 1)
+    #if(($includecounter % 2) == 0)
+        <tr class="evenformrow">
+    #else
+        <tr class="oddformrow">
+    #end
+          <td class="formcolumncell">
+            <a name="s${SEQNUM}_includefilter_$includecounter">
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Delete'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.DeleteIncludeFilter'))$includecounterdisplay" onclick='javascript:s${SEQNUM}_deleteIncludeFilter("$includecounter");'/>
+              <input type="hidden" name="s${SEQNUM}_includefilter_op_$includecounter" value="Continue"/>
+              ## Carry the existing value in the form, as the exclude rows and the
+              ## hidden-field branch of this template do.
+              <input type="hidden" name="s${SEQNUM}_includefilter_regex_$includecounter" value="$Encoder.attributeEscape($includefilter)"/>
+            </a>
+          </td>
+          <td class="formcolumncell">
+            <nobr>$Encoder.bodyEscape($includefilter)</nobr>
+          </td>
+        </tr>
+    #set($includecounter = $includecounter + 1)
+  #end
+  #end
+  
+  
+  #if($includecounter == 0)
+        <tr class="formrow"><td class="formmessage" 
colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified'))</td></tr>
+  #end
+      
+        <tr class="formrow"><td class="formseparator" 
colspan="3"><hr/></td></tr>
+        ## Add row for include filters: Add button, row count, operation field and
+        ## the text box for the new value. The anchor name matches the one the
+        ## include filter script functions pass to postFormSetAnchor.
+        <tr class="formrow">
+          <td class="formcolumncell">
+            <a name="s${SEQNUM}_includefilter">
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Add'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.AddIncludeFilter'))" onclick="javascript:s${SEQNUM}_addIncludeFilter();"/>
+            </a>
+            <input type="hidden" name="s${SEQNUM}_includefilter_count" value="$includecounter"/>
+            <input type="hidden" name="s${SEQNUM}_includefilter_op" value="Continue"/>
+          </td>
+          <td class="formcolumncell">
+            <nobr><input type="text" size="15" name="s${SEQNUM}_includefilter_regex" value=""/></nobr>
+          </td>
+        </tr>
+      </table>
+    </td>
+  </tr>
+  <tr>
+    <td class="description">
+      
<nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.ExcludeFilters'))</nobr>
+    </td>
+    <td class="boxcell">
+      <table class="formtable">
+        <tr class="formheaderrow">
+          <td class="formcolumnheader"></td>
+          <td 
class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+        </tr>
+
+  ## One table row per configured exclude filter: a Delete button, the per-row
+  ## operation field, the filter value as a hidden field, and the value as text.
+  ## Rows alternate between the even and odd row classes.
+  #set($excludecounter = 0)
+  #foreach($excludefilter in $EXCLUDEFILTERS)
+    #set($excludecounterdisplay = $excludecounter + 1)
+    #if(($excludecounter % 2) == 0)
+        <tr class="evenformrow">
+    #else
+        <tr class="oddformrow">
+    #end
+          <td class="formcolumncell">
+            <a name="s${SEQNUM}_excludefilter_$excludecounter">
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Delete'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.DeleteExcludeFilter'))$excludecounterdisplay" onclick='javascript:s${SEQNUM}_deleteExcludeFilter("$excludecounter");'/>
+              <input type="hidden" name="s${SEQNUM}_excludefilter_op_$excludecounter" value="Continue"/>
+              <input type="hidden" name="s${SEQNUM}_excludefilter_regex_$excludecounter" value="$Encoder.attributeEscape($excludefilter)"/>
+            </a>
+          </td>
+          <td class="formcolumncell">
+            <nobr>$Encoder.bodyEscape($excludefilter)</nobr>
+          </td>
+        </tr>
+    #set($excludecounter = $excludecounter + 1)
+  #end
+  
+  #if($excludecounter == 0)
+        <tr class="formrow"><td class="formmessage" 
colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified'))</td></tr>
+  #end
+      
+        <tr class="formrow"><td class="formseparator" 
colspan="3"><hr/></td></tr>
+        ## Add row for exclude filters: Add button, row count, operation field and
+        ## the text box for the new value. The anchor name matches the one the
+        ## exclude filter script functions pass to postFormSetAnchor.
+        <tr class="formrow">
+          <td class="formcolumncell">
+            <a name="s${SEQNUM}_excludefilter">
+              <input type="button" value="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.Add'))" alt="$Encoder.attributeEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.AddExcludeFilter'))" onclick="javascript:s${SEQNUM}_addExcludeFilter();"/>
+            </a>
+            <input type="hidden" name="s${SEQNUM}_excludefilter_count" value="$excludecounter"/>
+            <input type="hidden" name="s${SEQNUM}_excludefilter_op" value="Continue"/>
+          </td>
+          <td class="formcolumncell">
+            <nobr><input type="text" size="15" name="s${SEQNUM}_excludefilter_regex" value=""/></nobr>
+          </td>
+        </tr>
+      </table>
+    </td>
+  </tr>
+</table>
+
+#else
+
+  ## Branch taken when the tab name or the sequence number does not match the
+  ## selected one: every include and exclude filter is emitted as a hidden field,
+  ## followed by a hidden count for each list.
+  #set($includecounter = 0)
+  #foreach($includefilter in $INCLUDEFILTERS)
+<input type="hidden" name="s${SEQNUM}_includefilter_regex_$includecounter" value="$Encoder.attributeEscape($includefilter)"/>
+    #set($includecounter = $includecounter + 1)
+  #end
+<input type="hidden" name="s${SEQNUM}_includefilter_count" value="$includecounter"/>
+
+  #set($excludecounter = 0)
+  #foreach($excludefilter in $EXCLUDEFILTERS)
+<input type="hidden" name="s${SEQNUM}_excludefilter_regex_$excludecounter" value="$Encoder.attributeEscape($excludefilter)"/>
+    #set($excludecounter = $excludecounter + 1)
+  #end
+<input type="hidden" name="s${SEQNUM}_excludefilter_count" value="$excludecounter"/>
+
+
+#end

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewConfiguration.html
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<table class="displaytable">
+  
+</table>

Added: 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html
URL: 
http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html?rev=1827009&view=auto
==============================================================================
--- 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html
 (added)
+++ 
manifoldcf/branches/CONNECTORS-1500/connectors/datafari-html-extractor-connector/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/htmlextractor/viewSpecification.html
 Fri Mar 16 16:31:26 2018
@@ -0,0 +1,82 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+<table class="displaytable">
+
+ 
+  <tr>
+    <td class="description">
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName')) $Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.IncludeFilters'))</nobr>
+    </td>
+    <td class="boxcell">
+      <table class="formtable">
+        <tr class="formheaderrow">
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+        </tr>
+
+  #set($includecounter = 0)
+  #foreach($includefilter in $INCLUDEFILTERS)
+    #if(($includecounter % 2) == 0)
+        <tr class="evenformrow">
+    #else
+        <tr class="oddformrow">
+    #end
+          <td class="formcolumncell">
+            <nobr>$Encoder.bodyEscape($includefilter)</nobr>
+          </td>
+        </tr>
+    #set($includecounter = $includecounter + 1)
+  #end
+  
+  #if($includecounter == 0)
+        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoIncludeFilterSpecified'))</td></tr>
+  #end
+      </table>
+    </td>
+  </tr>
+  <tr>
+    <td class="description">
+      <nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.HtmlExtractorTabName')) $Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.ExcludeFilters'))</nobr>
+    </td>
+    <td class="boxcell">
+      <table class="formtable">
+        <tr class="formheaderrow">
+          <td class="formcolumnheader"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.RegularExpression'))</nobr></td>
+        </tr>
+
+  #set($excludecounter = 0)
+  #foreach($excludefilter in $EXCLUDEFILTERS)
+    #if(($excludecounter % 2) == 0)
+        <tr class="evenformrow">
+    #else
+        <tr class="oddformrow">
+    #end
+          <td class="formcolumncell">
+            <nobr>$Encoder.bodyEscape($excludefilter)</nobr>
+          </td>
+        </tr>
+    #set($excludecounter = $excludecounter + 1)
+  #end
+  
+  #if($excludecounter == 0)
+        <tr class="formrow"><td class="formmessage" colspan="3">$Encoder.bodyEscape($ResourceBundle.getString('DatafariHtmlExtractorTransformationConnector.NoExcludeFilterSpecified'))</td></tr>
+  #end
+      </table>
+    </td>
+  </tr>
+</table>


Reply via email to