This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new eeccee7  TIKA-3287 -- many thanks to Nick DiPiazza!
eeccee7 is described below

commit eeccee7593423853725992106cf7b574cb5a6de8
Author: tballison <[email protected]>
AuthorDate: Fri Jan 29 14:22:09 2021 -0500

    TIKA-3287 -- many thanks to Nick DiPiazza!
---
 .../tika/pipes/fetcher/SimpleUrlFetcher.java       |  72 ----
 .../apache/tika/pipes/fetcher/UrlFetcherTest.java  |  51 ---
 tika-pipes/tika-emitters/tika-emitter-fs/pom.xml   |  65 +++
 .../tika/pipes/emitter/fs/FileSystemEmitter.java   |  43 +-
 tika-pipes/tika-emitters/tika-emitter-s3/pom.xml   |  66 ++++
 tika-pipes/tika-emitters/tika-emitter-solr/pom.xml |  65 +++
 .../tika/pipes/emitter/solr/SolrEmitter.java       |  36 +-
 .../tika-fetch-iterator-csv/pom.xml                |  61 +++
 .../tika-fetch-iterator-jdbc/pom.xml               |  65 +++
 .../tika-fetch-iterator-s3/pom.xml                 |  65 +++
 tika-pipes/tika-fetchers/pom.xml                   |   1 +
 .../{tika-fetcher-s3 => tika-fetcher-http}/pom.xml |  45 +--
 .../tika/pipes/fetcher/http/HttpFetcher.java       | 155 ++++++++
 .../tika/pipes/fetcher/http/HttpFetcherTest.java   |  63 +++
 .../src/test/resources/tika-config-http.xml        |  26 ++
 tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml   |   5 -
 .../org/apache/tika/client/HttpClientFactory.java  | 436 +++++++++++++++++++++
 .../org/apache/tika/client/HttpClientUtil.java     |   9 +-
 tika-server/tika-server-client/pom.xml             |  68 ++++
 .../tika/server/client/TikaClientConfig.java       |  85 ----
 .../server/client/TikaClientConfigException.java   |   1 +
 21 files changed, 1222 insertions(+), 261 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/SimpleUrlFetcher.java 
b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/SimpleUrlFetcher.java
deleted file mode 100644
index 276d2f4..0000000
--- 
a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/SimpleUrlFetcher.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.pipes.fetcher;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.HttpHeaders;
-import org.apache.tika.metadata.Metadata;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-import java.net.URLConnection;
-import java.util.Collections;
-import java.util.Set;
-
-/**
- * This is a lightweight fetcher that uses Java's
- * {@link URL#openStream()}. Please consider a more
- * robust way to fetch URLs, e.g. Apache httpcomponents,
- * curl or wget...
- *
- * This is limited to http: and https: urls.  This does
- * not support the file:/// protocol.  See {@link FileSystemFetcher}.
- */
-public class SimpleUrlFetcher extends AbstractFetcher {
-
-    private static String NAME = "url";
-
-    public SimpleUrlFetcher() {
-        super(NAME);
-    }
-
-
-    @Override
-    public InputStream fetch(String fetchKey, Metadata metadata)
-            throws IOException, TikaException {
-        URL url = new URL(fetchKey);
-        if (! url.getProtocol().equals("http") &&
-                ! url.getProtocol().equals("https") &&
-                        ! url.getProtocol().equals("ftp")) {
-            throw new TikaException("This fetcher only handles: http, https; NOT: "
-                    + url.getProtocol());
-        }
-        return TikaInputStream.get(url, metadata);
-    }
-
-    public InputStream fetch(String fetchKey, long startRange, long endRange, 
Metadata metadata)
-            throws IOException, TikaException {
-        URL url = new URL(fetchKey);
-        URLConnection connection = url.openConnection();
-        connection.setRequestProperty("Range", 
"bytes="+startRange+"-"+endRange);
-        metadata.set(HttpHeaders.CONTENT_LENGTH, 
Long.toString(endRange-startRange+1));
-        TikaInputStream tis = TikaInputStream.get(connection.getInputStream());
-        tis.getPath();
-        return tis;
-    }
-}
diff --git 
a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/UrlFetcherTest.java 
b/tika-core/src/test/java/org/apache/tika/pipes/fetcher/UrlFetcherTest.java
deleted file mode 100644
index 7fb96bf..0000000
--- a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/UrlFetcherTest.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.pipes.fetcher;
-
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Ignore;
-import org.junit.Test;
-
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
-import java.util.zip.GZIPInputStream;
-
-import static org.junit.Assert.assertEquals;
-
-@Ignore("requires network connectivity")
-public class UrlFetcherTest {
-
-    @Test
-    public void testRange() throws Exception {
-        String url =
-                "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-45/segments/1603107869785.9/warc/CC-MAIN-20201020021700-20201020051700-00529.warc.gz";
-        long start = 969596307;
-        long end = start + 1408 - 1;
-        Metadata metadata = new Metadata();
-
-        try (TemporaryResources tmp = new TemporaryResources()) {
-            Path tmpPath = tmp.createTempFile();
-            try (InputStream is = new SimpleUrlFetcher().fetch(url, start, 
end, metadata)) {
-                Files.copy(new GZIPInputStream(is), tmpPath, 
StandardCopyOption.REPLACE_EXISTING);
-            }
-            assertEquals(2461, Files.size(tmpPath));
-        }
-    }
-}
diff --git a/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml 
b/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml
index 6c5c2cf..cf214da 100644
--- a/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml
@@ -45,4 +45,69 @@
         </dependency>
     </dependencies>
 
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifestEntries>
+                            
<Automatic-Module-Name>org.apache.tika.pipes.emitter.fs</Automatic-Module-Name>
+                        </manifestEntries>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>${maven.shade.version}</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
+                            <!-- <filters> -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*</exclude>
+                                        <exclude>LICENSE.txt</exclude>
+                                        <exclude>NOTICE.txt</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/LICENSE</resource>
+                                    
<file>target/classes/META-INF/LICENSE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/NOTICE</resource>
+                                    <file>target/classes/META-INF/NOTICE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/DEPENDENCIES</resource>
+                                    
<file>target/classes/META-INF/DEPENDENCIES</file>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
 </project>
\ No newline at end of file
diff --git 
a/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
 
b/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
index 9398096..f58a21c 100644
--- 
a/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
+++ 
b/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
@@ -30,9 +30,11 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.Writer;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.FileAlreadyExistsException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
 import java.util.Collections;
 import java.util.List;
 import java.util.Set;
@@ -54,6 +56,9 @@ import java.util.Set;
  *                  &lt;param name="basePath" 
type="string"&gt;/path/to/output&lt;/param&gt;
  *                  &lt;!-- optional; default is 'json' --&gt;
  *                  &lt;param name="fileExtension" 
type="string"&gt;json&lt;/param&gt;
+ *                  &lt;!-- optional; if the file already exists, options 
('skip', 'replace', 'exception')
+ *                  default is 'exception' --&gt;
+ *                  &lt;param name="onExists" 
type="string"&gt;skip&lt;/param&gt;
  *              &lt;/params&gt;
  *          &lt;/emitter&gt;
  *      &lt;/emitters&gt;
@@ -61,8 +66,13 @@ import java.util.Set;
  */
 public class FileSystemEmitter extends AbstractEmitter implements 
StreamEmitter {
 
+    enum ON_EXISTS {
+        SKIP, EXCEPTION, REPLACE
+    }
+
     private Path basePath = null;
     private String fileExtension = "json";
+    private ON_EXISTS onExists = ON_EXISTS.EXCEPTION;
 
 
     @Override
@@ -108,9 +118,40 @@ public class FileSystemEmitter extends AbstractEmitter 
implements StreamEmitter
         this.fileExtension = fileExtension;
     }
 
+    @Field
+    public void setOnExists(String onExists) {
+        if (onExists.equals("skip")) {
+            this.onExists = ON_EXISTS.SKIP;
+        } else if (onExists.equals("replace")) {
+            this.onExists = ON_EXISTS.REPLACE;
+        } else if (onExists.equals("exception")) {
+            this.onExists = ON_EXISTS.EXCEPTION;
+        } else {
+            throw new IllegalArgumentException(
+                    "Don't understand '" + onExists +
+                            "'; must be one of: 'skip', 'replace', 'exception'");
+        }
+    }
     @Override
     public void emit(String path, InputStream inputStream, Metadata 
userMetadata) throws IOException,
             TikaEmitterException {
-        Files.copy(inputStream, basePath.resolve(path));
+        Path target = basePath.resolve(path);
+
+        if (!Files.isDirectory(target.getParent())) {
+            Files.createDirectories(target.getParent());
+        }
+        if (onExists == ON_EXISTS.REPLACE) {
+            Files.copy(inputStream, target, 
StandardCopyOption.REPLACE_EXISTING);
+        } else if (onExists == ON_EXISTS.EXCEPTION) {
+            Files.copy(inputStream, target);
+        } else if (onExists == ON_EXISTS.SKIP) {
+            if (!Files.isRegularFile(target)) {
+                try {
+                    Files.copy(inputStream, target);
+                } catch (FileAlreadyExistsException e) {
+                    //swallow
+                }
+            }
+        }
     }
 }
diff --git a/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml 
b/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml
index 61f8c77..62a79f0 100644
--- a/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml
@@ -82,4 +82,70 @@
             <scope>test</scope>
         </dependency>
     </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifestEntries>
+                            
<Automatic-Module-Name>org.apache.tika.pipes.emitter.s3</Automatic-Module-Name>
+                        </manifestEntries>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>${maven.shade.version}</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
+                            <!-- <filters> -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*</exclude>
+                                        <exclude>LICENSE.txt</exclude>
+                                        <exclude>NOTICE.txt</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/LICENSE</resource>
+                                    
<file>target/classes/META-INF/LICENSE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/NOTICE</resource>
+                                    <file>target/classes/META-INF/NOTICE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/DEPENDENCIES</resource>
+                                    
<file>target/classes/META-INF/DEPENDENCIES</file>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
 </project>
\ No newline at end of file
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml 
b/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
index 6aefb89..1edfff0 100644
--- a/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
@@ -52,4 +52,69 @@
         </dependency>
     </dependencies>
 
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifestEntries>
+                            
<Automatic-Module-Name>org.apache.tika.pipes.emitter.solr</Automatic-Module-Name>
+                        </manifestEntries>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>${maven.shade.version}</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
+                            <!-- <filters> -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*</exclude>
+                                        <exclude>LICENSE.txt</exclude>
+                                        <exclude>NOTICE.txt</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/LICENSE</resource>
+                                    
<file>target/classes/META-INF/LICENSE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/NOTICE</resource>
+                                    <file>target/classes/META-INF/NOTICE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/DEPENDENCIES</resource>
+                                    
<file>target/classes/META-INF/DEPENDENCIES</file>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
 </project>
\ No newline at end of file
diff --git 
a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
 
b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
index 2391424..5830b14 100644
--- 
a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
+++ 
b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
@@ -19,6 +19,8 @@ package org.apache.tika.pipes.emitter.solr;
 import com.google.gson.Gson;
 import com.google.gson.JsonArray;
 import com.google.gson.JsonObject;
+import org.apache.http.client.HttpClient;
+import org.apache.tika.client.HttpClientFactory;
 import org.apache.tika.client.HttpClientUtil;
 import org.apache.tika.client.TikaClientException;
 import org.apache.tika.config.Field;
@@ -56,6 +58,8 @@ public class SolrEmitter extends AbstractEmitter implements 
Initializable {
     private String contentField = "content";
     private String idField = "id";
     private int commitWithin = 100;
+    private HttpClientFactory httpClientFactory;
+    private HttpClient httpClient;
 
     @Override
     public void emit(String emitKey, List<Metadata> metadataList) throws 
IOException,
@@ -67,7 +71,8 @@ public class SolrEmitter extends AbstractEmitter implements 
Initializable {
         String json = jsonify(emitKey, metadataList);
         LOG.debug("emitting json:"+json);
         try {
-            
HttpClientUtil.postJson(url+UPDATE_PATH+"?commitWithin="+getCommitWithin(), 
json);
+            HttpClientUtil.postJson(httpClient,
+                    url+UPDATE_PATH+"?commitWithin="+getCommitWithin(), json);
         } catch (TikaClientException e) {
             throw new TikaEmitterException("can't post", e);
         }
@@ -200,7 +205,6 @@ public class SolrEmitter extends AbstractEmitter implements 
Initializable {
     public int getCommitWithin() {
         return commitWithin;
     }
-    //TODO: add username/password for authentication?
 
     /**
      * Specify the field in the first Metadata that should be
@@ -213,10 +217,36 @@ public class SolrEmitter extends AbstractEmitter 
implements Initializable {
         this.idField = idField;
     }
 
+    //TODO -- add other httpclient configurations
+    @Field
+    public void setUserName(String userName) {
+        httpClientFactory.setUserName(userName);
+    }
+
+    @Field
+    public void setPassword(String password) {
+        httpClientFactory.setPassword(password);
+    }
+
+    @Field
+    public void setAuthScheme(String authScheme) {
+        httpClientFactory.setAuthScheme(authScheme);
+    }
+
+    @Field
+    public void setProxyHost(String proxyHost) {
+        httpClientFactory.setProxyHost(proxyHost);
+    }
+
+    @Field
+    public void setProxyPort(int proxyPort) {
+        httpClientFactory.setProxyPort(proxyPort);
+    }
+
     @Override
     public void initialize(Map<String, Param> params) throws 
TikaConfigException {
         //TODO: build the client here?
-
+        httpClient = httpClientFactory.build();
     }
 
     @Override
diff --git a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-csv/pom.xml 
b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-csv/pom.xml
index 40906df..6996366 100644
--- a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-csv/pom.xml
+++ b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-csv/pom.xml
@@ -63,6 +63,67 @@
                     </excludes>
                 </configuration>
             </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifestEntries>
+                            
<Automatic-Module-Name>org.apache.tika.pipes.fetchiterator.csv</Automatic-Module-Name>
+                        </manifestEntries>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>${maven.shade.version}</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
+                            <!-- <filters> -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*</exclude>
+                                        <exclude>LICENSE.txt</exclude>
+                                        <exclude>NOTICE.txt</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/LICENSE</resource>
+                                    
<file>target/classes/META-INF/LICENSE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/NOTICE</resource>
+                                    <file>target/classes/META-INF/NOTICE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/DEPENDENCIES</resource>
+                                    
<file>target/classes/META-INF/DEPENDENCIES</file>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
         </plugins>
     </build>
 </project>
diff --git a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-jdbc/pom.xml 
b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-jdbc/pom.xml
index 14cd6f3..2813ed4 100644
--- a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-jdbc/pom.xml
+++ b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-jdbc/pom.xml
@@ -52,4 +52,69 @@
             <scope>test</scope>
         </dependency>
     </dependencies>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifestEntries>
+                            
<Automatic-Module-Name>org.apache.tika.pipes.fetchiterator.jdbc</Automatic-Module-Name>
+                        </manifestEntries>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>${maven.shade.version}</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
+                            <!-- <filters> -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*</exclude>
+                                        <exclude>LICENSE.txt</exclude>
+                                        <exclude>NOTICE.txt</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/LICENSE</resource>
+                                    
<file>target/classes/META-INF/LICENSE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/NOTICE</resource>
+                                    <file>target/classes/META-INF/NOTICE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/DEPENDENCIES</resource>
+                                    
<file>target/classes/META-INF/DEPENDENCIES</file>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
 </project>
diff --git a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-s3/pom.xml 
b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-s3/pom.xml
index 85ee53c..f8b2424 100644
--- a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-s3/pom.xml
+++ b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-s3/pom.xml
@@ -79,4 +79,69 @@
             <scope>test</scope>
         </dependency>
     </dependencies>
+    <build>
+        <plugins>
+            <!-- Sets the Automatic-Module-Name manifest entry and also attaches a test-jar. -->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifestEntries>
+                            
<Automatic-Module-Name>org.apache.tika.pipes.fetchiterator.s3</Automatic-Module-Name>
+                        </manifestEntries>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <!-- Builds a shaded jar during the package phase. -->
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>${maven.shade.version}</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
+                            <!-- Exclude the top-level META-INF entries and license/notice text
+                                 files of every shaded artifact; this module's own LICENSE,
+                                 NOTICE and DEPENDENCIES are re-added by the transformers below. -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*</exclude>
+                                        <exclude>LICENSE.txt</exclude>
+                                        <exclude>NOTICE.txt</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <!-- Copy this module's generated legal files from target/classes
+                                 into the shaded jar. -->
+                            <transformers>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/LICENSE</resource>
+                                    
<file>target/classes/META-INF/LICENSE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/NOTICE</resource>
+                                    <file>target/classes/META-INF/NOTICE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/DEPENDENCIES</resource>
+                                    
<file>target/classes/META-INF/DEPENDENCIES</file>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
 </project>
diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml
index 597c76b..d541a49 100644
--- a/tika-pipes/tika-fetchers/pom.xml
+++ b/tika-pipes/tika-fetchers/pom.xml
@@ -34,6 +34,7 @@
     <url>http://tika.apache.org/</url>
 
     <modules>
+        <module>tika-fetcher-http</module>
         <module>tika-fetcher-s3</module>
     </modules>
 
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml 
b/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml
similarity index 74%
copy from tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml
copy to tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml
index 55afa8d..09dcb97 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml
+++ b/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml
@@ -27,40 +27,10 @@
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
-    <artifactId>tika-fetcher-s3</artifactId>
+    <artifactId>tika-fetcher-http</artifactId>
 
     <dependencies>
         <dependency>
-            <groupId>com.amazonaws</groupId>
-            <artifactId>aws-java-sdk-s3</artifactId>
-            <version>${aws.version}</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>commons-logging</groupId>
-                    <artifactId>commons-logging</artifactId>
-                </exclusion>
-                <exclusion>
-                    <groupId>com.fasterxml.jackson.core</groupId>
-                    <artifactId>jackson-core</artifactId>
-                </exclusion>
-                <exclusion>
-                    <groupId>com.fasterxml.jackson.core</groupId>
-                    <artifactId>jackson-databind</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
-
-        <dependency>
-            <groupId>com.fasterxml.jackson.core</groupId>
-            <artifactId>jackson-databind</artifactId>
-            <version>${jackson.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>commons-logging</groupId>
-            <artifactId>commons-logging</artifactId>
-            <version>${commons.logging.version}</version>
-        </dependency>
-        <dependency>
             <groupId>org.slf4j</groupId>
             <artifactId>slf4j-log4j12</artifactId>
         </dependency>
@@ -71,6 +41,11 @@
             <scope>provided</scope>
         </dependency>
         <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-httpclient-commons</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
             <groupId>junit</groupId>
             <artifactId>junit</artifactId>
             <scope>test</scope>
@@ -85,7 +60,7 @@
                 <configuration>
                     <archive>
                         <manifestEntries>
-                            
<Automatic-Module-Name>org.apache.tika.pipes.fetcher.s3</Automatic-Module-Name>
+                            
<Automatic-Module-Name>org.apache.tika.pipes.fetcher.http</Automatic-Module-Name>
                         </manifestEntries>
                     </archive>
                 </configuration>
@@ -122,11 +97,6 @@
                                 </filter>
                             </filters>
                             <transformers>
-                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
-                                    
<mainClass>org.apache.tika.eval.app.TikaEvalCLI</mainClass>
-                                </transformer>
-
-                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
 />
                                 <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
                                     <resource>META-INF/LICENSE</resource>
                                     
<file>target/classes/META-INF/LICENSE</file>
@@ -144,7 +114,6 @@
                     </execution>
                 </executions>
             </plugin>
-
         </plugins>
     </build>
 </project>
\ No newline at end of file
diff --git 
a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java
 
b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java
new file mode 100644
index 0000000..75bd1a1
--- /dev/null
+++ 
b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetcher.http;
+
+
+import org.apache.commons.io.IOUtils;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.tika.client.HttpClientFactory;
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.fetcher.AbstractFetcher;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+
+/**
+ * Fetcher that retrieves a resource over HTTP(S) using Apache httpclient.
+ * <p>
+ * The response body is spooled to a local temporary file before the stream is
+ * handed back, so the HTTP connection is released as soon as {@code fetch}
+ * returns. The underlying client is created in {@link #initialize(Map)} from the
+ * settings supplied through the {@code @Field} setters; {@code fetch} must not be
+ * called before initialization.
+ */
+public class HttpFetcher extends AbstractFetcher implements Initializable {
+
+    private static final Logger LOG = LoggerFactory.getLogger(HttpFetcher.class);
+
+    private final HttpClientFactory httpClientFactory;
+    private HttpClient httpClient;
+
+    public HttpFetcher() {
+        httpClientFactory = new HttpClientFactory();
+    }
+
+    /**
+     * Fetches the full resource at the given URL.
+     *
+     * @param fetchKey URL to GET
+     * @param metadata not used by this fetcher
+     * @return stream over a local copy of the response body
+     * @throws IOException on a non-2xx status or any I/O failure
+     */
+    @Override
+    public InputStream fetch(String fetchKey, Metadata metadata)
+            throws IOException, TikaException {
+        HttpGet get = new HttpGet(fetchKey);
+        return get(get);
+    }
+
+    /**
+     * Fetches a byte range of the resource at the given URL by sending a
+     * {@code Range: bytes=startRange-endRange} header.
+     *
+     * @param fetchKey   URL to GET
+     * @param startRange first byte offset, as written into the Range header
+     * @param endRange   last byte offset, as written into the Range header
+     * @param metadata   not used by this fetcher
+     * @return stream over a local copy of the response body
+     * @throws IOException on a non-2xx status or any I/O failure
+     */
+    public InputStream fetch(String fetchKey, long startRange, long endRange, Metadata metadata)
+            throws IOException, TikaException {
+        HttpGet get = new HttpGet(fetchKey);
+        get.setHeader("Range", "bytes="+startRange+"-"+endRange);
+        return get(get);
+    }
+
+    /**
+     * Executes the request, validates the status code and spools the body to a
+     * temp file. The response is always closed before returning or throwing so
+     * that the pooled connection is released.
+     */
+    private InputStream get(HttpGet get) throws IOException, TikaException {
+        if (httpClient == null) {
+            throw new IllegalStateException(
+                    "HttpFetcher has not been initialized; call initialize() before fetch()");
+        }
+        HttpResponse response = httpClient.execute(get);
+        try {
+            int code = response.getStatusLine().getStatusCode();
+            if (code < 200 || code > 299) {
+                throw new IOException("bad status code: "+
+                        code
+                        + " :: " +
+                        responseToString(response));
+            }
+            if (response.getEntity() == null) {
+                throw new IOException("response contained no entity: " + get.getURI());
+            }
+
+            //spool to local
+            long start = System.currentTimeMillis();
+            TikaInputStream tis = TikaInputStream.get(
+                    response.getEntity().getContent());
+            try {
+                // getPath() forces the copy to a temp file while the connection is open
+                tis.getPath();
+            } catch (IOException | RuntimeException e) {
+                // don't leak the temp resources if spooling fails part way through
+                tis.close();
+                throw e;
+            }
+            long elapsed = System.currentTimeMillis()-start;
+            LOG.debug("took {} ms to copy to local tmp file", elapsed);
+            return tis;
+        } finally {
+            closeQuietly(response);
+        }
+    }
+
+    /**
+     * Reads the response body as UTF-8 for use in an error message; returns the
+     * empty string if there is no body or it cannot be read.
+     */
+    private String responseToString(HttpResponse response) {
+        if (response.getEntity() == null) {
+            return "";
+        }
+        try (InputStream is = response.getEntity().getContent()) {
+            return IOUtils.toString(is, StandardCharsets.UTF_8);
+        } catch (IOException e) {
+            LOG.warn("IOexception trying to read error message", e);
+            return "";
+        }
+    }
+
+    /**
+     * Closes the response if it is closeable; a failure to close is logged
+     * rather than propagated so it cannot mask the primary result or exception.
+     */
+    private static void closeQuietly(HttpResponse response) {
+        if (response instanceof CloseableHttpResponse) {
+            try {
+                ((CloseableHttpResponse) response).close();
+            } catch (IOException e) {
+                LOG.warn("problem closing http response", e);
+            }
+        }
+    }
+
+    @Field
+    public void setUserName(String userName) {
+        httpClientFactory.setUserName(userName);
+    }
+
+    @Field
+    public void setPassword(String password) {
+        httpClientFactory.setPassword(password);
+    }
+
+    @Field
+    public void setNtDomain(String domain) {
+        httpClientFactory.setNtDomain(domain);
+    }
+
+    @Field
+    public void setAuthScheme(String authScheme) {
+        httpClientFactory.setAuthScheme(authScheme);
+    }
+
+    @Field
+    public void setProxyHost(String proxyHost) {
+        httpClientFactory.setProxyHost(proxyHost);
+    }
+
+    @Field
+    public void setProxyPort(int proxyPort) {
+        httpClientFactory.setProxyPort(proxyPort);
+    }
+
+    @Field
+    public void setConnectTimeout(int connectTimeout) {
+        httpClientFactory.setConnectTimeout(connectTimeout);
+    }
+
+    @Field
+    public void setRequestTimeout(int requestTimeout) {
+        httpClientFactory.setRequestTimeout(requestTimeout);
+    }
+
+    @Field
+    public void setSocketTimeout(int socketTimeout) {
+        httpClientFactory.setSocketTimeout(socketTimeout);
+    }
+
+    /**
+     * Builds the http client from the settings applied through the setters.
+     *
+     * @param params not used by this fetcher
+     * @throws TikaConfigException if the client cannot be built
+     */
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        httpClient = httpClientFactory.build();
+    }
+
+    /** No additional initialization checks are performed. */
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+
+    }
+}
diff --git 
a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java
 
b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java
new file mode 100644
index 0000000..8a2d8a1
--- /dev/null
+++ 
b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetcher.http;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.util.zip.GZIPInputStream;
+
+import static org.junit.Assert.assertEquals;
+
+@Ignore("requires network connectivity")
+public class HttpFetcherTest {
+
+        @Test
+        public void testRange() throws Exception {
+            String url =
+                    
"https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-45/segments/1603107869785.9/warc/CC-MAIN-20201020021700-20201020051700-00529.warc.gz";;
+            long start = 969596307;
+            long end = start + 1408 - 1;
+            Metadata metadata = new Metadata();
+            HttpFetcher httpFetcher = (HttpFetcher) 
getConfig("tika-config-http.xml")
+                    .getFetcherManager().getFetcher("http");
+            try (TemporaryResources tmp = new TemporaryResources()) {
+                Path tmpPath = tmp.createTempFile();
+                try (InputStream is = httpFetcher.fetch(url, start, end, 
metadata)) {
+                    Files.copy(new GZIPInputStream(is), tmpPath, 
StandardCopyOption.REPLACE_EXISTING);
+                }
+                assertEquals(2461, Files.size(tmpPath));
+            }
+        }
+
+
+    TikaConfig getConfig(String path) throws TikaException, IOException, 
SAXException {
+            return new 
TikaConfig(HttpFetcherTest.class.getResourceAsStream("/"+path));
+    }
+
+
+}
diff --git 
a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml
 
b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml
new file mode 100644
index 0000000..028f123
--- /dev/null
+++ 
b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <!-- Registers a single HttpFetcher under the name "http"; no other fetcher
+         parameters are set here. -->
+    <fetchers>
+        <fetcher class="org.apache.tika.pipes.fetcher.http.HttpFetcher">
+            <params>
+                <param name="name" type="string">http</param>
+            </params>
+        </fetcher>
+    </fetchers>
+</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml 
b/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml
index 55afa8d..158e701 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml
+++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml
@@ -122,11 +122,6 @@
                                 </filter>
                             </filters>
                             <transformers>
-                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
-                                    
<mainClass>org.apache.tika.eval.app.TikaEvalCLI</mainClass>
-                                </transformer>
-
-                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
 />
                                 <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
                                     <resource>META-INF/LICENSE</resource>
                                     
<file>target/classes/META-INF/LICENSE</file>
diff --git 
a/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientFactory.java
 
b/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientFactory.java
new file mode 100644
index 0000000..e74dcb9
--- /dev/null
+++ 
b/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientFactory.java
@@ -0,0 +1,436 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.client;
+
+import org.apache.http.Header;
+import org.apache.http.HeaderElement;
+import org.apache.http.HeaderElementIterator;
+import org.apache.http.HttpHost;
+import org.apache.http.HttpRequest;
+import org.apache.http.HttpResponse;
+import org.apache.http.ProtocolException;
+import org.apache.http.auth.AuthSchemeProvider;
+import org.apache.http.auth.AuthScope;
+import org.apache.http.auth.Credentials;
+import org.apache.http.auth.NTCredentials;
+import org.apache.http.auth.UsernamePasswordCredentials;
+import org.apache.http.client.CredentialsProvider;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.config.AuthSchemes;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.config.Registry;
+import org.apache.http.config.RegistryBuilder;
+import org.apache.http.conn.ConnectionKeepAliveStrategy;
+import org.apache.http.conn.socket.ConnectionSocketFactory;
+import org.apache.http.conn.socket.PlainConnectionSocketFactory;
+import org.apache.http.conn.ssl.NoopHostnameVerifier;
+import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
+import org.apache.http.conn.ssl.TrustStrategy;
+import org.apache.http.impl.auth.BasicSchemeFactory;
+import org.apache.http.impl.auth.NTLMSchemeFactory;
+import org.apache.http.impl.client.BasicCredentialsProvider;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.impl.client.LaxRedirectStrategy;
+import org.apache.http.impl.conn.DefaultProxyRoutePlanner;
+import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
+import org.apache.http.message.BasicHeaderElementIterator;
+import org.apache.http.protocol.HTTP;
+import org.apache.http.protocol.HttpContext;
+import org.apache.http.ssl.SSLContexts;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.utils.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.crypto.BadPaddingException;
+import javax.crypto.Cipher;
+import javax.crypto.IllegalBlockSizeException;
+import javax.crypto.NoSuchPaddingException;
+import javax.crypto.spec.SecretKeySpec;
+import javax.net.ssl.SSLContext;
+import java.io.UnsupportedEncodingException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.security.InvalidKeyException;
+import java.security.KeyManagementException;
+import java.security.KeyStoreException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Arrays;
+import java.util.Base64;
+import java.util.HashSet;
+import java.util.Set;
+
+public class HttpClientFactory {
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(HttpClientFactory.class);
+
+    private AES aes;
+
+    private String proxyHost;
+    private int proxyPort;
+    private Set<String> allowedHostsForRedirect = new HashSet<>();
+    private int maxConnectionsPerRoute = 1000;
+    private int maxConnections = 2000;
+    private int requestTimeout = 120000;
+    private int connectTimeout = 120000;
+    private int socketTimeout = 120000;
+    private int keepAliveOnBadKeepAliveValueMs = 1000;
+    private String userName;
+    private String password;
+    private String ntDomain;//if using nt credentials
+    private String authScheme = "basic"; //ntlm or basic
+    private boolean credentialsAESEncrypted = false;
+
+    /** @return the proxy host, or null/blank if no proxy is configured */
+    public String getProxyHost() {
+        return proxyHost;
+    }
+
+    /**
+     * @param proxyHost proxy host name; a proxy route planner is only installed
+     *                  by {@link #build()} when this is not blank
+     */
+    public void setProxyHost(String proxyHost) {
+        this.proxyHost = proxyHost;
+    }
+
+    /** @return the proxy port used together with the proxy host */
+    public int getProxyPort() {
+        return proxyPort;
+    }
+
+    /** @param proxyPort proxy port used together with the proxy host */
+    public void setProxyPort(int proxyPort) {
+        this.proxyPort = proxyPort;
+    }
+
+    /** @return the set of host names handed to the redirect strategy */
+    public Set<String> getAllowedHostsForRedirect() {
+        return allowedHostsForRedirect;
+    }
+
+    /**
+     * @param allowedHostsForRedirect host names handed to the redirect strategy
+     *        created in {@link #build()}; the set is stored as-is, not copied
+     */
+    public void setAllowedHostsForRedirect(Set<String> 
allowedHostsForRedirect) {
+        this.allowedHostsForRedirect = allowedHostsForRedirect;
+    }
+
+    /** @return maximum pooled connections per route (field default is 1000) */
+    public int getMaxConnectionsPerRoute() {
+        return maxConnectionsPerRoute;
+    }
+
+    /** @param maxConnectionsPerRoute maximum pooled connections per route */
+    public void setMaxConnectionsPerRoute(int maxConnectionsPerRoute) {
+        this.maxConnectionsPerRoute = maxConnectionsPerRoute;
+    }
+
+    /** @return maximum total pooled connections (field default is 2000) */
+    public int getMaxConnections() {
+        return maxConnections;
+    }
+
+    /** @param maxConnections maximum total pooled connections */
+    public void setMaxConnections(int maxConnections) {
+        this.maxConnections = maxConnections;
+    }
+
+    /**
+     * @return the connection-request timeout (field default is 120000;
+     *         presumably milliseconds, as expected by httpclient -- confirm)
+     */
+    public int getRequestTimeout() {
+        return requestTimeout;
+    }
+
+    /** @param requestTimeout connection-request timeout applied in {@link #build()} */
+    public void setRequestTimeout(int requestTimeout) {
+        this.requestTimeout = requestTimeout;
+    }
+
+    /** @return the configured connect timeout (field default is 120000) */
+    public int getConnectTimeout() {
+        return connectTimeout;
+    }
+
+    /** @param connectTimeout connect timeout setting read by {@link #build()} */
+    public void setConnectTimeout(int connectTimeout) {
+        this.connectTimeout = connectTimeout;
+    }
+
+    /** @return the configured socket timeout (field default is 120000) */
+    public int getSocketTimeout() {
+        return socketTimeout;
+    }
+
+    /** @param socketTimeout socket timeout applied in {@link #build()} */
+    public void setSocketTimeout(int socketTimeout) {
+        this.socketTimeout = socketTimeout;
+    }
+
+    /**
+     * @return the keep-alive duration returned by the keep-alive strategy when
+     *         the response carries no usable keep-alive timeout (field default is 1000)
+     */
+    public int getKeepAliveOnBadKeepAliveValueMs() {
+        return keepAliveOnBadKeepAliveValueMs;
+    }
+
+    /**
+     * @param keepAliveOnBadKeepAliveValueMs fallback keep-alive duration used when
+     *        the response carries no usable keep-alive timeout
+     */
+    public void setKeepAliveOnBadKeepAliveValueMs(int 
keepAliveOnBadKeepAliveValueMs) {
+        this.keepAliveOnBadKeepAliveValueMs = keepAliveOnBadKeepAliveValueMs;
+    }
+
+    /** @return the user name used for authentication, if any */
+    public String getUserName() {
+        return userName;
+    }
+
+    /** @param userName user name used for authentication */
+    public void setUserName(String userName) {
+        this.userName = userName;
+    }
+
+    /** @return the password used for authentication, if any */
+    public String getPassword() {
+        return password;
+    }
+
+    /** @param password password used for authentication */
+    public void setPassword(String password) {
+        this.password = password;
+    }
+
+    /** @return the NT domain used with the "ntlm" auth scheme */
+    public String getNtDomain() {
+        return ntDomain;
+    }
+
+    /** @param ntDomain NT domain; must not be blank when the auth scheme is "ntlm" */
+    public void setNtDomain(String ntDomain) {
+        this.ntDomain = ntDomain;
+    }
+
+    /** @return the auth scheme; the field default is "basic" */
+    public String getAuthScheme() {
+        return authScheme;
+    }
+
+    /**
+     * Sets the authentication scheme. Only the lower-case values "basic" and
+     * "ntlm" are recognized when the credentials provider is configured.
+     *
+     * @param authScheme "basic" or "ntlm"
+     */
+    public void setAuthScheme(String authScheme) {
+        this.authScheme = authScheme;
+    }
+
+    /**
+     * Builds a pooling {@link HttpClient} from the current settings.
+     * <p>
+     * NOTE: the client trusts every certificate and skips host name verification.
+     * Credentials and proxy are only configured when the corresponding fields are set.
+     *
+     * @return a newly built client
+     * @throws TikaConfigException if the SSL context cannot be created or the
+     *                             credentials cannot be decrypted
+     */
+    public HttpClient build() throws TikaConfigException {
+        LOG.info("http client does not verify ssl at this point.  " +
+                "If you need that, please open a ticket.");
+        TrustStrategy acceptingTrustStrategy = (cert, authType) -> true;
+        SSLContext sslContext;
+        try {
+            sslContext = SSLContexts.custom().loadTrustMaterial(null,
+                    acceptingTrustStrategy).build();
+        } catch (NoSuchAlgorithmException | KeyManagementException | KeyStoreException e) {
+            throw new TikaConfigException("failed to build the SSL context for the http client", e);
+        }
+        SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext,
+                NoopHostnameVerifier.INSTANCE);
+
+        Registry<ConnectionSocketFactory> socketFactoryRegistry =
+                RegistryBuilder.<ConnectionSocketFactory>create()
+                        .register("https", sslsf)
+                        .register("http", new PlainConnectionSocketFactory())
+                        .build();
+
+        PoolingHttpClientConnectionManager manager =
+                new PoolingHttpClientConnectionManager(socketFactoryRegistry);
+        manager.setDefaultMaxPerRoute(maxConnectionsPerRoute);
+        manager.setMaxTotal(maxConnections);
+
+        HttpClientBuilder builder = HttpClients.custom();
+        addCredentialsProvider(builder);
+        addProxy(builder);
+        return builder.setConnectionManager(manager)
+                .setRedirectStrategy(
+                        new CustomRedirectStrategy(allowedHostsForRedirect))
+                .setDefaultRequestConfig(RequestConfig.custom()
+                        .setTargetPreferredAuthSchemes(Arrays.asList(AuthSchemes.BASIC,
+                                AuthSchemes.NTLM))
+                        // time to wait for a connection from the pool
+                        .setConnectionRequestTimeout(requestTimeout)
+                        // time to establish the connection; previously this value was
+                        // mistakenly written over the connection-request timeout
+                        .setConnectTimeout(connectTimeout)
+                        .setSocketTimeout(socketTimeout)
+                        .build()
+                )
+                .setKeepAliveStrategy(getKeepAliveStrategy())
+                .setSSLSocketFactory(sslsf)
+                .setSSLHostnameVerifier(NoopHostnameVerifier.INSTANCE)
+                .build();
+    }
+
+    /**
+     * Installs a proxy route planner on the builder when a proxy host is configured;
+     * does nothing if the proxy host is blank.
+     */
+    private void addProxy(HttpClientBuilder builder) {
+        if (StringUtils.isBlank(proxyHost)) {
+            return;
+        }
+        HttpHost proxyTarget = new HttpHost(proxyHost, proxyPort);
+        builder.setRoutePlanner(new DefaultProxyRoutePlanner(proxyTarget));
+    }
+
+    /**
+     * Configures default credentials and the matching auth scheme registry on the
+     * builder. Nothing is configured when both user name and password are blank.
+     *
+     * @param builder builder to configure
+     * @throws TikaConfigException      if a credential cannot be decrypted
+     * @throws IllegalArgumentException if only one of user name/password is set,
+     *                                  if the auth scheme is neither "basic" nor "ntlm",
+     *                                  or if "ntlm" is used without an NT domain
+     */
+    private void addCredentialsProvider(HttpClientBuilder builder) throws TikaConfigException {
+
+        if (StringUtils.isBlank(userName) && StringUtils.isBlank(password)) {
+            return;
+        }
+
+        // at this point at least one is set; reject the half-specified case
+        if (StringUtils.isBlank(userName) || StringUtils.isBlank(password)) {
+            throw new IllegalArgumentException("can't have one of 'username', " +
+                    "'password' null and the other not");
+        }
+
+        String finalUserName = decrypt(userName);
+        String finalPassword = decrypt(password);
+        String finalDomain = decrypt(ntDomain);
+        Credentials credentials;
+        Registry<AuthSchemeProvider> authSchemeRegistry;
+        if ("basic".equals(authScheme)) {
+            credentials = new UsernamePasswordCredentials(finalUserName, finalPassword);
+            authSchemeRegistry = RegistryBuilder
+                    .<AuthSchemeProvider>create()
+                    .register("basic", new BasicSchemeFactory())
+                    .build();
+        } else if ("ntlm".equals(authScheme)) {
+            if (StringUtils.isBlank(ntDomain)) {
+                throw new IllegalArgumentException("must specify 'ntDomain'");
+            }
+            credentials = new NTCredentials(finalUserName, finalPassword, null, finalDomain);
+            authSchemeRegistry = RegistryBuilder.<AuthSchemeProvider>create()
+                    .register("ntlm", new NTLMSchemeFactory()).build();
+        } else {
+            // fail fast instead of registering null credentials and a null registry
+            throw new IllegalArgumentException("unsupported 'authScheme': " + authScheme +
+                    "; only 'basic' and 'ntlm' are supported");
+        }
+        CredentialsProvider provider = new BasicCredentialsProvider();
+        provider.setCredentials(AuthScope.ANY, credentials);
+        builder.setDefaultCredentialsProvider(provider);
+        builder.setDefaultAuthSchemeRegistry(authSchemeRegistry);
+    }
+
+    /**
+     * Returns the value unchanged when credential encryption is disabled or the
+     * value is null; otherwise decrypts it with a lazily created AES helper.
+     */
+    private String decrypt(String encrypted) throws TikaConfigException {
+        boolean nothingToDecrypt = !credentialsAESEncrypted || encrypted == null;
+        if (nothingToDecrypt) {
+            return encrypted;
+        }
+        if (aes == null) {
+            aes = new AES();
+        }
+        return aes.decrypt(encrypted);
+    }
+
+    /**
+     * Returns a keep-alive strategy that honors a parseable {@code timeout} element
+     * of the response's keep-alive header (converted from seconds by multiplying by
+     * 1000) and otherwise falls back to {@code keepAliveOnBadKeepAliveValueMs}.
+     */
+    public ConnectionKeepAliveStrategy getKeepAliveStrategy() {
+        return (response, context) -> {
+            HeaderElementIterator elements = new BasicHeaderElementIterator(
+                    response.headerIterator(HTTP.CONN_KEEP_ALIVE));
+            while (elements.hasNext()) {
+                HeaderElement element = elements.nextElement();
+                String name = element.getName();
+                String timeout = element.getValue();
+                if (timeout == null || name == null || !name.equalsIgnoreCase("timeout")) {
+                    continue;
+                }
+                try {
+                    return Long.parseLong(timeout) * 1000;
+                } catch (NumberFormatException ignored) {
+                    // unparseable timeout value: keep scanning the remaining elements
+                }
+            }
+            // missing or bad keep-alive header
+            return keepAliveOnBadKeepAliveValueMs;
+        };
+    }
+
+    /**
+     * Redirect strategy that encodes redirect locations which are not
+     * syntactically valid URIs before handing them to the parent strategy and
+     * that can restrict redirects to an allow-list of hosts.
+     */
+    private static class CustomRedirectStrategy extends LaxRedirectStrategy {
+
+        private static final Logger LOG = LoggerFactory.getLogger(CustomRedirectStrategy.class);
+        //hosts a redirect may point to; an empty set switches the host check off
+        private final Set<String> allowedHosts;
+
+        /**
+         * @param allowedHosts hosts that redirects may target; must not be
+         *                     {@code null}, an empty set allows every host
+         */
+        public CustomRedirectStrategy(Set<String> allowedHosts) {
+            this.allowedHosts = allowedHosts;
+        }
+
+        /**
+         * Creates the redirect URI, URL-encoding the location first if it does
+         * not parse as a {@link URI}.
+         *
+         * @param location raw value of the redirect location
+         * @return the URI built by the parent strategy
+         * @throws ProtocolException if the parent strategy rejects the location
+         */
+        @Override
+        protected URI createLocationURI(final String location) throws ProtocolException {
+            String newLocation = location;
+            try {
+                new URI(newLocation);
+            } catch (final URISyntaxException ex) {
+                LOG.warn("Redirected URL: [ " + newLocation + " ] will be encoded");
+                try {
+                    newLocation = URLEncoder.encode(newLocation, StandardCharsets.UTF_8.name());
+                } catch (UnsupportedEncodingException e) {
+                    LOG.warn("Well, that didn't work out... :(");
+                }
+            }
+            return super.createLocationURI(newLocation);
+        }
+
+        /**
+         * Decides whether the response should be followed as a redirect.
+         * <p>
+         * On top of the parent's decision, a redirect is refused when the
+         * {@code Location} header is missing or blank, or when the allow-list
+         * is non-empty and does not contain the location's host.
+         *
+         * @return {@code true} if the redirect should be followed
+         * @throws ProtocolException if the parent strategy fails
+         */
+        @Override
+        public boolean isRedirected(HttpRequest request, HttpResponse response,
+                                    HttpContext context) throws ProtocolException {
+            if (!super.isRedirected(request, response, context)) {
+                return false;
+            }
+            Header locationHeader = response.getFirstHeader("Location");
+            //guard against a redirect status that arrives without a Location
+            //header; treat it like a blank location instead of throwing an NPE
+            if (locationHeader == null) {
+                return false;
+            }
+            String location = locationHeader.getValue();
+            if (StringUtils.isBlank(location)) {
+                return false;
+            }
+            URI uri;
+            try {
+                uri = new URI(location);
+            } catch (URISyntaxException e) {
+                //malformed locations are encoded later in createLocationURI()
+                return true;
+            }
+            //NOTE(review): uri.getHost() is null for relative locations, so a
+            //non-empty allow-list also rejects same-host relative redirects --
+            //confirm that this is intended
+            if (!allowedHosts.isEmpty() && !allowedHosts.contains(uri.getHost())) {
+                LOG.info("Not allowing external redirect. OriginalUrl={}," +
+                        " RedirectLocation={}", request.getRequestLine().getUri(), location);
+                return false;
+            }
+            return true;
+        }
+    }
+
+    private class AES {
+        private final SecretKeySpec secretKey;
+        private byte[] key;
+
+        private AES() throws TikaConfigException {
+            secretKey = setKey(System.getenv("AES_KEY"));
+        }
+
+        private SecretKeySpec setKey(String myKey) throws TikaConfigException {
+            MessageDigest sha = null;
+            try {
+                key = myKey.getBytes(StandardCharsets.UTF_8);
+                sha = MessageDigest.getInstance("SHA-1");
+                key = sha.digest(key);
+                key = Arrays.copyOf(key, 16);
+                return new SecretKeySpec(key, "AES");
+            } catch (NoSuchAlgorithmException e) {
+                throw new TikaConfigException("bad key", e);
+            }
+        }
+
+        public String encrypt(String strToEncrypt) throws TikaConfigException {
+            try {
+                Cipher cipher = Cipher.getInstance("AES/ECB/PKCS5Padding");
+                cipher.init(Cipher.ENCRYPT_MODE, secretKey);
+                return Base64.getEncoder()
+                        
.encodeToString(cipher.doFinal(strToEncrypt.getBytes(StandardCharsets.UTF_8)));
+            } catch 
(NoSuchAlgorithmException|InvalidKeyException|NoSuchPaddingException|BadPaddingException|IllegalBlockSizeException
 e) {
+                throw new TikaConfigException("bad encryption info", e);
+            }
+        }
+
+        public String decrypt(String strToDecrypt) throws TikaConfigException {
+            try {
+                Cipher cipher = Cipher.getInstance("AES/ECB/PKCS5PADDING");
+                cipher.init(Cipher.DECRYPT_MODE, secretKey);
+                return new 
String(cipher.doFinal(Base64.getDecoder().decode(strToDecrypt)),
+                        StandardCharsets.UTF_8);
+            } catch (NoSuchAlgorithmException|
+                    InvalidKeyException|
+                    NoSuchPaddingException|
+                    BadPaddingException|
+                    IllegalBlockSizeException e) {
+                throw new TikaConfigException("bad encryption info", e);
+            }
+        }
+    }
+}
diff --git 
a/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientUtil.java
 
b/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientUtil.java
index 2e6fef7..780179c 100644
--- 
a/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientUtil.java
+++ 
b/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientUtil.java
@@ -19,9 +19,7 @@ package org.apache.tika.client;
 import org.apache.http.HttpResponse;
 import org.apache.http.client.HttpClient;
 import org.apache.http.client.methods.HttpPost;
-import org.apache.http.entity.BasicHttpEntity;
 import org.apache.http.entity.ByteArrayEntity;
-import org.apache.http.impl.client.HttpClients;
 import org.apache.http.util.EntityUtils;
 
 import java.io.IOException;
@@ -29,15 +27,13 @@ import java.nio.charset.StandardCharsets;
 
 public class HttpClientUtil {
 
-    private static HttpClient CLIENT = HttpClients.createDefault();
-
-    public static boolean postJson(String url, String json) throws IOException,
+    public static boolean postJson(HttpClient client, String url, String json) 
throws IOException,
             TikaClientException {
         HttpPost post = new HttpPost(url);
         ByteArrayEntity entity = new 
ByteArrayEntity(json.getBytes(StandardCharsets.UTF_8));
         post.setEntity(entity);
         post.setHeader("Content-Type", "application/json");
-        HttpResponse response = CLIENT.execute(post);
+        HttpResponse response = client.execute(post);
 
 
         if (response.getStatusLine().getStatusCode() != 200) {
@@ -51,4 +47,5 @@ public class HttpClientUtil {
         }
         return true;
     }
+
 }
diff --git a/tika-server/tika-server-client/pom.xml 
b/tika-server/tika-server-client/pom.xml
index 23e304f..d3daf68 100644
--- a/tika-server/tika-server-client/pom.xml
+++ b/tika-server/tika-server-client/pom.xml
@@ -58,4 +58,72 @@
         </dependency>
     </dependencies>
 
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifestEntries>
+                            
<Automatic-Module-Name>org.apache.tika.server.client</Automatic-Module-Name>
+                        </manifestEntries>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>${maven.shade.version}</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
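+                            <!-- drop each dependency's top-level META-INF entries and LICENSE/NOTICE files; the IncludeResourceTransformers below add this module's own LICENSE, NOTICE and DEPENDENCIES -->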
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*</exclude>
+                                        <exclude>LICENSE.txt</exclude>
+                                        <exclude>NOTICE.txt</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    
<mainClass>org.apache.tika.server.client.TikaClientCLI</mainClass>
+                                </transformer>
+
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/LICENSE</resource>
+                                    
<file>target/classes/META-INF/LICENSE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/NOTICE</resource>
+                                    <file>target/classes/META-INF/NOTICE</file>
+                                </transformer>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/DEPENDENCIES</resource>
+                                    
<file>target/classes/META-INF/DEPENDENCIES</file>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
 </project>
\ No newline at end of file
diff --git 
a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfig.java
 
b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfig.java
deleted file mode 100644
index 7034c89..0000000
--- 
a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfig.java
+++ /dev/null
@@ -1,85 +0,0 @@
-package org.apache.tika.server.client;
-
-import org.apache.tika.config.Param;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeTypes;
-import org.apache.tika.pipes.fetchiterator.EmptyFetchIterator;
-import org.apache.tika.pipes.fetchiterator.FetchIterator;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.xml.sax.SAXException;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.reflect.InvocationTargetException;
-import java.net.URL;
-import java.nio.file.Path;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-public class TikaClientConfig extends TikaConfig {
-    public TikaClientConfig(String file) throws TikaException, IOException, 
SAXException {
-        super(file);
-    }
-
-    public TikaClientConfig(Path path) throws TikaException, IOException, 
SAXException {
-        super(path);
-    }
-
-    public TikaClientConfig(Path path, ServiceLoader loader) throws 
TikaException, IOException, SAXException {
-        super(path, loader);
-    }
-
-    public TikaClientConfig(File file) throws TikaException, IOException, 
SAXException {
-        super(file);
-    }
-
-    public TikaClientConfig(File file, ServiceLoader loader) throws 
TikaException, IOException, SAXException {
-        super(file, loader);
-    }
-
-    public TikaClientConfig(URL url) throws TikaException, IOException, 
SAXException {
-        super(url);
-    }
-
-    public TikaClientConfig(URL url, ClassLoader loader) throws TikaException, 
IOException, SAXException {
-        super(url, loader);
-    }
-
-    public TikaClientConfig(URL url, ServiceLoader loader) throws 
TikaException, IOException, SAXException {
-        super(url, loader);
-    }
-
-    public TikaClientConfig(InputStream stream) throws TikaException, 
IOException, SAXException {
-        super(stream);
-    }
-
-    public TikaClientConfig(Document document) throws TikaException, 
IOException {
-        super(document);
-    }
-
-    public TikaClientConfig(Document document, ServiceLoader loader) throws 
TikaException, IOException {
-        super(document, loader);
-    }
-
-    public TikaClientConfig(Element element) throws TikaException, IOException 
{
-        super(element);
-    }
-
-    public TikaClientConfig(Element element, ClassLoader loader) throws 
TikaException, IOException {
-        super(element, loader);
-    }
-
-    public TikaClientConfig(ClassLoader loader) throws MimeTypeException, 
IOException {
-        super(loader);
-    }
-
-    public TikaClientConfig() throws TikaException, IOException {
-    }
-
-}
diff --git 
a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfigException.java
 
b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfigException.java
index 2144619..97776b8 100644
--- 
a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfigException.java
+++ 
b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfigException.java
@@ -19,6 +19,7 @@ package org.apache.tika.server.client;
 import org.apache.tika.exception.TikaException;
 
 public class TikaClientConfigException extends TikaException {
+
     public TikaClientConfigException(String msg) {
         super(msg);
     }

Reply via email to