This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new eeccee7 TIKA-3287 -- many thanks to Nick DiPiazza!
eeccee7 is described below
commit eeccee7593423853725992106cf7b574cb5a6de8
Author: tballison <[email protected]>
AuthorDate: Fri Jan 29 14:22:09 2021 -0500
TIKA-3287 -- many thanks to Nick DiPiazza!
---
.../tika/pipes/fetcher/SimpleUrlFetcher.java | 72 ----
.../apache/tika/pipes/fetcher/UrlFetcherTest.java | 51 ---
tika-pipes/tika-emitters/tika-emitter-fs/pom.xml | 65 +++
.../tika/pipes/emitter/fs/FileSystemEmitter.java | 43 +-
tika-pipes/tika-emitters/tika-emitter-s3/pom.xml | 66 ++++
tika-pipes/tika-emitters/tika-emitter-solr/pom.xml | 65 +++
.../tika/pipes/emitter/solr/SolrEmitter.java | 36 +-
.../tika-fetch-iterator-csv/pom.xml | 61 +++
.../tika-fetch-iterator-jdbc/pom.xml | 65 +++
.../tika-fetch-iterator-s3/pom.xml | 65 +++
tika-pipes/tika-fetchers/pom.xml | 1 +
.../{tika-fetcher-s3 => tika-fetcher-http}/pom.xml | 45 +--
.../tika/pipes/fetcher/http/HttpFetcher.java | 155 ++++++++
.../tika/pipes/fetcher/http/HttpFetcherTest.java | 63 +++
.../src/test/resources/tika-config-http.xml | 26 ++
tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml | 5 -
.../org/apache/tika/client/HttpClientFactory.java | 436 +++++++++++++++++++++
.../org/apache/tika/client/HttpClientUtil.java | 9 +-
tika-server/tika-server-client/pom.xml | 68 ++++
.../tika/server/client/TikaClientConfig.java | 85 ----
.../server/client/TikaClientConfigException.java | 1 +
21 files changed, 1222 insertions(+), 261 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/SimpleUrlFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/SimpleUrlFetcher.java
deleted file mode 100644
index 276d2f4..0000000
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/SimpleUrlFetcher.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.pipes.fetcher;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.HttpHeaders;
-import org.apache.tika.metadata.Metadata;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-import java.net.URLConnection;
-import java.util.Collections;
-import java.util.Set;
-
-/**
- * This is a lightweight fetcher that uses Java's
- * {@link URL#openStream()}. Please consider a more
- * robust way to fetch URLs, e.g. Apache httpcomponents,
- * curl or wget...
- *
- * This is limited to http: and https: urls. This does
- * not support the file:/// protocol. See {@link FileSystemFetcher}.
- */
-public class SimpleUrlFetcher extends AbstractFetcher {
-
- private static String NAME = "url";
-
- public SimpleUrlFetcher() {
- super(NAME);
- }
-
-
- @Override
- public InputStream fetch(String fetchKey, Metadata metadata)
- throws IOException, TikaException {
- URL url = new URL(fetchKey);
- if (! url.getProtocol().equals("http") &&
- ! url.getProtocol().equals("https") &&
- ! url.getProtocol().equals("ftp")) {
- throw new TikaException("This fetcher only handles: http, https; NOT: "
- + url.getProtocol());
- }
- return TikaInputStream.get(url, metadata);
- }
-
- public InputStream fetch(String fetchKey, long startRange, long endRange, Metadata metadata)
- throws IOException, TikaException {
- URL url = new URL(fetchKey);
- URLConnection connection = url.openConnection();
- connection.setRequestProperty("Range", "bytes="+startRange+"-"+endRange);
- metadata.set(HttpHeaders.CONTENT_LENGTH, Long.toString(endRange-startRange+1));
- TikaInputStream tis = TikaInputStream.get(connection.getInputStream());
- tis.getPath();
- return tis;
- }
-}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/UrlFetcherTest.java b/tika-core/src/test/java/org/apache/tika/pipes/fetcher/UrlFetcherTest.java
deleted file mode 100644
index 7fb96bf..0000000
--- a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/UrlFetcherTest.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.pipes.fetcher;
-
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Ignore;
-import org.junit.Test;
-
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
-import java.util.zip.GZIPInputStream;
-
-import static org.junit.Assert.assertEquals;
-
-@Ignore("requires network connectivity")
-public class UrlFetcherTest {
-
- @Test
- public void testRange() throws Exception {
- String url =
- "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-45/segments/1603107869785.9/warc/CC-MAIN-20201020021700-20201020051700-00529.warc.gz";
- long start = 969596307;
- long end = start + 1408 - 1;
- Metadata metadata = new Metadata();
-
- try (TemporaryResources tmp = new TemporaryResources()) {
- Path tmpPath = tmp.createTempFile();
- try (InputStream is = new SimpleUrlFetcher().fetch(url, start, end, metadata)) {
- Files.copy(new GZIPInputStream(is), tmpPath, StandardCopyOption.REPLACE_EXISTING);
- }
- assertEquals(2461, Files.size(tmpPath));
- }
- }
-}
diff --git a/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml b/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml
index 6c5c2cf..cf214da 100644
--- a/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml
@@ -45,4 +45,69 @@
</dependency>
</dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.pipes.emitter.fs</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>${maven.shade.version}</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <createDependencyReducedPom>
+ false
+ </createDependencyReducedPom>
+ <!-- <filters> -->
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*</exclude>
+ <exclude>LICENSE.txt</exclude>
+ <exclude>NOTICE.txt</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ <transformers>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/LICENSE</resource>
+ <file>target/classes/META-INF/LICENSE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/NOTICE</resource>
+ <file>target/classes/META-INF/NOTICE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/DEPENDENCIES</resource>
+ <file>target/classes/META-INF/DEPENDENCIES</file>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git a/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java b/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
index 9398096..f58a21c 100644
--- a/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
@@ -30,9 +30,11 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
+import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
import java.util.Collections;
import java.util.List;
import java.util.Set;
@@ -54,6 +56,9 @@ import java.util.Set;
* <param name="basePath" type="string">/path/to/output</param>
* <!-- optional; default is 'json' -->
* <param name="fileExtension" type="string">json</param>
+ * <!-- optional; if the file already exists, options ('skip', 'replace', 'exception')
+ * default is 'exception' -->
+ * <param name="onExists" type="string">skip</param>
* </params>
* </emitter>
* </emitters>
@@ -61,8 +66,13 @@ import java.util.Set;
*/
public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter {
+ enum ON_EXISTS {
+ SKIP, EXCEPTION, REPLACE
+ }
+
private Path basePath = null;
private String fileExtension = "json";
+ private ON_EXISTS onExists = ON_EXISTS.EXCEPTION;
@Override
@@ -108,9 +118,40 @@ public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter
this.fileExtension = fileExtension;
}
+ @Field
+ public void setOnExists(String onExists) {
+ if (onExists.equals("skip")) {
+ this.onExists = ON_EXISTS.SKIP;
+ } else if (onExists.equals("replace")) {
+ this.onExists = ON_EXISTS.REPLACE;
+ } else if (onExists.equals("exception")) {
+ this.onExists = ON_EXISTS.EXCEPTION;
+ } else {
+ throw new IllegalArgumentException(
+ "Don't understand '" + onExists +
+ "'; must be one of: 'skip', 'replace', 'exception'");
+ }
+ }
@Override
public void emit(String path, InputStream inputStream, Metadata userMetadata) throws IOException,
TikaEmitterException {
- Files.copy(inputStream, basePath.resolve(path));
+ Path target = basePath.resolve(path);
+
+ if (!Files.isDirectory(target.getParent())) {
+ Files.createDirectories(target.getParent());
+ }
+ if (onExists == ON_EXISTS.REPLACE) {
+ Files.copy(inputStream, target, StandardCopyOption.REPLACE_EXISTING);
+ } else if (onExists == ON_EXISTS.EXCEPTION) {
+ Files.copy(inputStream, target);
+ } else if (onExists == ON_EXISTS.SKIP) {
+ if (!Files.isRegularFile(target)) {
+ try {
+ Files.copy(inputStream, target);
+ } catch (FileAlreadyExistsException e) {
+ //swallow
+ }
+ }
+ }
}
}
diff --git a/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml b/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml
index 61f8c77..62a79f0 100644
--- a/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-s3/pom.xml
@@ -82,4 +82,70 @@
<scope>test</scope>
</dependency>
</dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.pipes.emitter.s3</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>${maven.shade.version}</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <createDependencyReducedPom>
+ false
+ </createDependencyReducedPom>
+ <!-- <filters> -->
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*</exclude>
+ <exclude>LICENSE.txt</exclude>
+ <exclude>NOTICE.txt</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ <transformers>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/LICENSE</resource>
+ <file>target/classes/META-INF/LICENSE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/NOTICE</resource>
+ <file>target/classes/META-INF/NOTICE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/DEPENDENCIES</resource>
+ <file>target/classes/META-INF/DEPENDENCIES</file>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml b/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
index 6aefb89..1edfff0 100644
--- a/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
@@ -52,4 +52,69 @@
</dependency>
</dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.pipes.emitter.solr</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>${maven.shade.version}</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <createDependencyReducedPom>
+ false
+ </createDependencyReducedPom>
+ <!-- <filters> -->
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*</exclude>
+ <exclude>LICENSE.txt</exclude>
+ <exclude>NOTICE.txt</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ <transformers>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/LICENSE</resource>
+ <file>target/classes/META-INF/LICENSE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/NOTICE</resource>
+ <file>target/classes/META-INF/NOTICE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/DEPENDENCIES</resource>
+ <file>target/classes/META-INF/DEPENDENCIES</file>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
index 2391424..5830b14 100644
--- a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
@@ -19,6 +19,8 @@ package org.apache.tika.pipes.emitter.solr;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
+import org.apache.http.client.HttpClient;
+import org.apache.tika.client.HttpClientFactory;
import org.apache.tika.client.HttpClientUtil;
import org.apache.tika.client.TikaClientException;
import org.apache.tika.config.Field;
@@ -56,6 +58,8 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
private String contentField = "content";
private String idField = "id";
private int commitWithin = 100;
+ private HttpClientFactory httpClientFactory;
+ private HttpClient httpClient;
@Override
public void emit(String emitKey, List<Metadata> metadataList) throws IOException,
@@ -67,7 +71,8 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
String json = jsonify(emitKey, metadataList);
LOG.debug("emitting json:"+json);
try {
- HttpClientUtil.postJson(url+UPDATE_PATH+"?commitWithin="+getCommitWithin(), json);
+ HttpClientUtil.postJson(httpClient,
+ url+UPDATE_PATH+"?commitWithin="+getCommitWithin(), json);
} catch (TikaClientException e) {
throw new TikaEmitterException("can't post", e);
}
@@ -200,7 +205,6 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
public int getCommitWithin() {
return commitWithin;
}
- //TODO: add username/password for authentication?
/**
* Specify the field in the first Metadata that should be
@@ -213,10 +217,36 @@ public class SolrEmitter extends AbstractEmitter implements Initializable {
this.idField = idField;
}
+ //TODO -- add other httpclient configurations
+ @Field
+ public void setUserName(String userName) {
+ httpClientFactory.setUserName(userName);
+ }
+
+ @Field
+ public void setPassword(String password) {
+ httpClientFactory.setPassword(password);
+ }
+
+ @Field
+ public void setAuthScheme(String authScheme) {
+ httpClientFactory.setAuthScheme(authScheme);
+ }
+
+ @Field
+ public void setProxyHost(String proxyHost) {
+ httpClientFactory.setProxyHost(proxyHost);
+ }
+
+ @Field
+ public void setProxyPort(int proxyPort) {
+ httpClientFactory.setProxyPort(proxyPort);
+ }
+
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
//TODO: build the client here?
-
+ httpClient = httpClientFactory.build();
}
@Override
diff --git a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-csv/pom.xml b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-csv/pom.xml
index 40906df..6996366 100644
--- a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-csv/pom.xml
+++ b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-csv/pom.xml
@@ -63,6 +63,67 @@
</excludes>
</configuration>
</plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.pipes.fetchiterator.csv</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>${maven.shade.version}</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <createDependencyReducedPom>
+ false
+ </createDependencyReducedPom>
+ <!-- <filters> -->
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*</exclude>
+ <exclude>LICENSE.txt</exclude>
+ <exclude>NOTICE.txt</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ <transformers>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/LICENSE</resource>
+ <file>target/classes/META-INF/LICENSE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/NOTICE</resource>
+ <file>target/classes/META-INF/NOTICE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/DEPENDENCIES</resource>
+ <file>target/classes/META-INF/DEPENDENCIES</file>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
</plugins>
</build>
</project>
diff --git a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-jdbc/pom.xml b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-jdbc/pom.xml
index 14cd6f3..2813ed4 100644
--- a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-jdbc/pom.xml
+++ b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-jdbc/pom.xml
@@ -52,4 +52,69 @@
<scope>test</scope>
</dependency>
</dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.pipes.fetchiterator.jdbc</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>${maven.shade.version}</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <createDependencyReducedPom>
+ false
+ </createDependencyReducedPom>
+ <!-- <filters> -->
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*</exclude>
+ <exclude>LICENSE.txt</exclude>
+ <exclude>NOTICE.txt</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ <transformers>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/LICENSE</resource>
+ <file>target/classes/META-INF/LICENSE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/NOTICE</resource>
+ <file>target/classes/META-INF/NOTICE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/DEPENDENCIES</resource>
+ <file>target/classes/META-INF/DEPENDENCIES</file>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ </plugins>
+ </build>
</project>
diff --git a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-s3/pom.xml b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-s3/pom.xml
index 85ee53c..f8b2424 100644
--- a/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-s3/pom.xml
+++ b/tika-pipes/tika-fetch-iterators/tika-fetch-iterator-s3/pom.xml
@@ -79,4 +79,69 @@
<scope>test</scope>
</dependency>
</dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.pipes.fetchiterator.s3</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>${maven.shade.version}</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <createDependencyReducedPom>
+ false
+ </createDependencyReducedPom>
+ <!-- <filters> -->
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*</exclude>
+ <exclude>LICENSE.txt</exclude>
+ <exclude>NOTICE.txt</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ <transformers>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/LICENSE</resource>
+ <file>target/classes/META-INF/LICENSE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/NOTICE</resource>
+ <file>target/classes/META-INF/NOTICE</file>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/DEPENDENCIES</resource>
+ <file>target/classes/META-INF/DEPENDENCIES</file>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ </plugins>
+ </build>
</project>
diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml
index 597c76b..d541a49 100644
--- a/tika-pipes/tika-fetchers/pom.xml
+++ b/tika-pipes/tika-fetchers/pom.xml
@@ -34,6 +34,7 @@
<url>http://tika.apache.org/</url>
<modules>
+ <module>tika-fetcher-http</module>
<module>tika-fetcher-s3</module>
</modules>
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml
similarity index 74%
copy from tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml
copy to tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml
index 55afa8d..09dcb97 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml
+++ b/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml
@@ -27,40 +27,10 @@
</parent>
<modelVersion>4.0.0</modelVersion>
- <artifactId>tika-fetcher-s3</artifactId>
+ <artifactId>tika-fetcher-http</artifactId>
<dependencies>
<dependency>
- <groupId>com.amazonaws</groupId>
- <artifactId>aws-java-sdk-s3</artifactId>
- <version>${aws.version}</version>
- <exclusions>
- <exclusion>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- </exclusion>
- <exclusion>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-core</artifactId>
- </exclusion>
- <exclusion>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-databind</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-databind</artifactId>
- <version>${jackson.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- <version>${commons.logging.version}</version>
- </dependency>
- <dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</dependency>
@@ -71,6 +41,11 @@
<scope>provided</scope>
</dependency>
<dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-httpclient-commons</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
@@ -85,7 +60,7 @@
<configuration>
<archive>
<manifestEntries>
- <Automatic-Module-Name>org.apache.tika.pipes.fetcher.s3</Automatic-Module-Name>
+ <Automatic-Module-Name>org.apache.tika.pipes.fetcher.http</Automatic-Module-Name>
</manifestEntries>
</archive>
</configuration>
@@ -122,11 +97,6 @@
</filter>
</filters>
<transformers>
- <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
- <mainClass>org.apache.tika.eval.app.TikaEvalCLI</mainClass>
- </transformer>
-
- <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
<resource>META-INF/LICENSE</resource>
<file>target/classes/META-INF/LICENSE</file>
@@ -144,7 +114,6 @@
</execution>
</executions>
</plugin>
-
</plugins>
</build>
</project>
\ No newline at end of file
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java
new file mode 100644
index 0000000..75bd1a1
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetcher.http;
+
+
+import org.apache.commons.io.IOUtils;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.tika.client.HttpClientFactory;
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.fetcher.AbstractFetcher;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+
+/**
+ * Based on Apache httpclient
+ */
+public class HttpFetcher extends AbstractFetcher implements Initializable {
+
+ Logger LOG = LoggerFactory.getLogger(HttpFetcher.class);
+ private HttpClientFactory httpClientFactory;
+ private HttpClient httpClient;
+
+ public HttpFetcher() {
+ httpClientFactory = new HttpClientFactory();
+ }
+ @Override
+ public InputStream fetch(String fetchKey, Metadata metadata)
+ throws IOException, TikaException {
+ HttpGet get = new HttpGet(fetchKey);
+ return get(get);
+ }
+
+ public InputStream fetch(String fetchKey, long startRange, long endRange, Metadata metadata)
+ throws IOException, TikaException {
+ HttpGet get = new HttpGet(fetchKey);
+ get.setHeader("Range", "bytes="+startRange+"-"+endRange);
+ return get(get);
+ }
+
+ private InputStream get(HttpGet get) throws IOException, TikaException {
+ HttpResponse response = httpClient.execute(get);
+ int code = response.getStatusLine().getStatusCode();
+ if (code < 200 || code > 299) {
+ throw new IOException("bad status code: "+
+ code
+ + " :: " +
+ responseToString(response.getEntity().getContent()));
+ }
+
+ //spool to local
+ long start = System.currentTimeMillis();
+ TikaInputStream tis = TikaInputStream.get(
+ response.getEntity().getContent());
+ tis.getPath();
+ if (response instanceof CloseableHttpResponse) {
+ ((CloseableHttpResponse) response).close();
+ }
+ long elapsed = System.currentTimeMillis()-start;
+ LOG.debug("took {} ms to copy to local tmp file", elapsed);
+ return tis;
+ }
+
+ private String responseToString(InputStream is) {
+ try {
+ return IOUtils.toString(is, StandardCharsets.UTF_8);
+ } catch (IOException e) {
+ LOG.warn("IOexception trying to read error message", e);
+ return "";
+ }
+ }
+
+ @Field
+ public void setUserName(String userName) {
+ httpClientFactory.setUserName(userName);
+ }
+
+ @Field
+ public void setPassword(String password) {
+ httpClientFactory.setPassword(password);
+ }
+
+ @Field
+ public void setNtDomain(String domain) {
+ httpClientFactory.setNtDomain(domain);
+ }
+
+ @Field
+ public void setAuthScheme(String authScheme) {
+ httpClientFactory.setAuthScheme(authScheme);
+ }
+
+ @Field
+ public void setProxyHost(String proxyHost) {
+ httpClientFactory.setProxyHost(proxyHost);
+ }
+
+ @Field
+ public void setProxyPort(int proxyPort) {
+ httpClientFactory.setProxyPort(proxyPort);
+ }
+
+ @Field
+ public void setConnectTimeout(int connectTimeout) {
+ httpClientFactory.setConnectTimeout(connectTimeout);
+ }
+
+ @Field
+ public void setRequestTimeout(int requestTimeout) {
+ httpClientFactory.setRequestTimeout(requestTimeout);
+ }
+
+ @Field
+ public void setSocketTimeout(int socketTimeout) {
+ httpClientFactory.setSocketTimeout(socketTimeout);
+ }
+
+ @Override
+ public void initialize(Map<String, Param> params) throws TikaConfigException {
+ httpClient = httpClientFactory.build();
+ }
+
+ @Override
+ public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException {
+
+ }
+}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java
new file mode 100644
index 0000000..8a2d8a1
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetcher.http;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.util.zip.GZIPInputStream;
+
+import static org.junit.Assert.assertEquals;
+
+@Ignore("requires network connectivity")
+public class HttpFetcherTest {
+
+ @Test
+ public void testRange() throws Exception {
+ String url =
+
"https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-45/segments/1603107869785.9/warc/CC-MAIN-20201020021700-20201020051700-00529.warc.gz";
+ long start = 969596307;
+ long end = start + 1408 - 1;
+ Metadata metadata = new Metadata();
+ HttpFetcher httpFetcher = (HttpFetcher)
getConfig("tika-config-http.xml")
+ .getFetcherManager().getFetcher("http");
+ try (TemporaryResources tmp = new TemporaryResources()) {
+ Path tmpPath = tmp.createTempFile();
+ try (InputStream is = httpFetcher.fetch(url, start, end,
metadata)) {
+ Files.copy(new GZIPInputStream(is), tmpPath,
StandardCopyOption.REPLACE_EXISTING);
+ }
+ assertEquals(2461, Files.size(tmpPath));
+ }
+ }
+
+
+ TikaConfig getConfig(String path) throws TikaException, IOException,
SAXException {
+ return new
TikaConfig(HttpFetcherTest.class.getResourceAsStream("/"+path));
+ }
+
+
+}
diff --git
a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml
b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml
new file mode 100644
index 0000000..028f123
--- /dev/null
+++
b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <fetchers>
+ <fetcher class="org.apache.tika.pipes.fetcher.http.HttpFetcher">
+ <params>
+ <param name="name" type="string">http</param>
+ </params>
+ </fetcher>
+ </fetchers>
+</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml
b/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml
index 55afa8d..158e701 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml
+++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml
@@ -122,11 +122,6 @@
</filter>
</filters>
<transformers>
- <transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
-
<mainClass>org.apache.tika.eval.app.TikaEvalCLI</mainClass>
- </transformer>
-
- <transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
<resource>META-INF/LICENSE</resource>
<file>target/classes/META-INF/LICENSE</file>
diff --git
a/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientFactory.java
b/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientFactory.java
new file mode 100644
index 0000000..e74dcb9
--- /dev/null
+++
b/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientFactory.java
@@ -0,0 +1,436 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.client;
+
+import org.apache.http.Header;
+import org.apache.http.HeaderElement;
+import org.apache.http.HeaderElementIterator;
+import org.apache.http.HttpHost;
+import org.apache.http.HttpRequest;
+import org.apache.http.HttpResponse;
+import org.apache.http.ProtocolException;
+import org.apache.http.auth.AuthSchemeProvider;
+import org.apache.http.auth.AuthScope;
+import org.apache.http.auth.Credentials;
+import org.apache.http.auth.NTCredentials;
+import org.apache.http.auth.UsernamePasswordCredentials;
+import org.apache.http.client.CredentialsProvider;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.config.AuthSchemes;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.config.Registry;
+import org.apache.http.config.RegistryBuilder;
+import org.apache.http.conn.ConnectionKeepAliveStrategy;
+import org.apache.http.conn.socket.ConnectionSocketFactory;
+import org.apache.http.conn.socket.PlainConnectionSocketFactory;
+import org.apache.http.conn.ssl.NoopHostnameVerifier;
+import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
+import org.apache.http.conn.ssl.TrustStrategy;
+import org.apache.http.impl.auth.BasicSchemeFactory;
+import org.apache.http.impl.auth.NTLMSchemeFactory;
+import org.apache.http.impl.client.BasicCredentialsProvider;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.impl.client.LaxRedirectStrategy;
+import org.apache.http.impl.conn.DefaultProxyRoutePlanner;
+import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
+import org.apache.http.message.BasicHeaderElementIterator;
+import org.apache.http.protocol.HTTP;
+import org.apache.http.protocol.HttpContext;
+import org.apache.http.ssl.SSLContexts;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.utils.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.crypto.BadPaddingException;
+import javax.crypto.Cipher;
+import javax.crypto.IllegalBlockSizeException;
+import javax.crypto.NoSuchPaddingException;
+import javax.crypto.spec.SecretKeySpec;
+import javax.net.ssl.SSLContext;
+import java.io.UnsupportedEncodingException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.security.InvalidKeyException;
+import java.security.KeyManagementException;
+import java.security.KeyStoreException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Arrays;
+import java.util.Base64;
+import java.util.HashSet;
+import java.util.Set;
+
+public class HttpClientFactory {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(HttpClientFactory.class);
+
+ // created on first use by decrypt(), and only when credentialsAESEncrypted is true
+ private AES aes;
+
+ // a blank proxyHost means no proxy route planner is installed (see addProxy)
+ private String proxyHost;
+ private int proxyPort;
+ // handed to CustomRedirectStrategy; an empty set disables the host restriction
+ private Set<String> allowedHostsForRedirect = new HashSet<>();
+ // limits applied to the pooling connection manager in build()
+ private int maxConnectionsPerRoute = 1000;
+ private int maxConnections = 2000;
+ // timeouts passed to the RequestConfig in build(); presumably milliseconds -- TODO confirm
+ private int requestTimeout = 120000;
+ private int connectTimeout = 120000;
+ private int socketTimeout = 120000;
+ // keep-alive duration used when the response carries no parseable Keep-Alive timeout
+ private int keepAliveOnBadKeepAliveValueMs = 1000;
+ private String userName;
+ private String password;
+ private String ntDomain;//if using nt credentials
+ private String authScheme = "basic"; //ntlm or basic
+ // NOTE(review): no setter for this flag is visible in this class, so decrypt()
+ // currently returns its input unchanged -- confirm whether that is intended
+ private boolean credentialsAESEncrypted = false;
+
+ // Plain accessors. Setters store the value as given; nothing is validated
+ // until build() is called.
+
+ public String getProxyHost() {
+ return proxyHost;
+ }
+
+ /**
+ * @param proxyHost proxy host name; when blank, no proxy is configured
+ */
+ public void setProxyHost(String proxyHost) {
+ this.proxyHost = proxyHost;
+ }
+
+ public int getProxyPort() {
+ return proxyPort;
+ }
+
+ /**
+ * @param proxyPort proxy port; only used when a proxy host is set
+ */
+ public void setProxyPort(int proxyPort) {
+ this.proxyPort = proxyPort;
+ }
+
+ /**
+ * @return the internal set itself, not a copy
+ */
+ public Set<String> getAllowedHostsForRedirect() {
+ return allowedHostsForRedirect;
+ }
+
+ /**
+ * @param allowedHostsForRedirect hosts that redirects may point to; the
+ * reference is stored without copying, and an empty set allows any host
+ */
+ public void setAllowedHostsForRedirect(Set<String>
allowedHostsForRedirect) {
+ this.allowedHostsForRedirect = allowedHostsForRedirect;
+ }
+
+ public int getMaxConnectionsPerRoute() {
+ return maxConnectionsPerRoute;
+ }
+
+ /**
+ * @param maxConnectionsPerRoute default per-route limit for the connection pool
+ */
+ public void setMaxConnectionsPerRoute(int maxConnectionsPerRoute) {
+ this.maxConnectionsPerRoute = maxConnectionsPerRoute;
+ }
+
+ public int getMaxConnections() {
+ return maxConnections;
+ }
+
+ /**
+ * @param maxConnections total limit for the connection pool
+ */
+ public void setMaxConnections(int maxConnections) {
+ this.maxConnections = maxConnections;
+ }
+
+ public int getRequestTimeout() {
+ return requestTimeout;
+ }
+
+ public void setRequestTimeout(int requestTimeout) {
+ this.requestTimeout = requestTimeout;
+ }
+
+ public int getConnectTimeout() {
+ return connectTimeout;
+ }
+
+ public void setConnectTimeout(int connectTimeout) {
+ this.connectTimeout = connectTimeout;
+ }
+
+ public int getSocketTimeout() {
+ return socketTimeout;
+ }
+
+ public void setSocketTimeout(int socketTimeout) {
+ this.socketTimeout = socketTimeout;
+ }
+
+ public int getKeepAliveOnBadKeepAliveValueMs() {
+ return keepAliveOnBadKeepAliveValueMs;
+ }
+
+ /**
+ * @param keepAliveOnBadKeepAliveValueMs keep-alive duration to fall back to
+ * when a response has no usable Keep-Alive timeout
+ */
+ public void setKeepAliveOnBadKeepAliveValueMs(int
keepAliveOnBadKeepAliveValueMs) {
+ this.keepAliveOnBadKeepAliveValueMs = keepAliveOnBadKeepAliveValueMs;
+ }
+
+ public String getUserName() {
+ return userName;
+ }
+
+ public void setUserName(String userName) {
+ this.userName = userName;
+ }
+
+ public String getPassword() {
+ return password;
+ }
+
+ public void setPassword(String password) {
+ this.password = password;
+ }
+
+ public String getNtDomain() {
+ return ntDomain;
+ }
+
+ /**
+ * @param ntDomain NT domain; required when the auth scheme is "ntlm"
+ */
+ public void setNtDomain(String ntDomain) {
+ this.ntDomain = ntDomain;
+ }
+
+ public String getAuthScheme() {
+ return authScheme;
+ }
+
+ /**
+ * Sets the authentication scheme used when a user name and password are
+ * configured. Only the exact, lower-case values "basic" and "ntlm" are
+ * recognized when the client is built; the default is "basic".
+ *
+ * @param authScheme "basic" or "ntlm"
+ */
+ public void setAuthScheme(String authScheme) {
+ this.authScheme = authScheme;
+ }
+
+    /**
+     * Builds an {@link HttpClient} from the current settings of this factory.
+     * <p>
+     * The client trusts every TLS certificate and skips hostname verification,
+     * uses a pooling connection manager sized by {@code maxConnectionsPerRoute}
+     * and {@code maxConnections}, and applies the configured credentials, proxy,
+     * redirect restrictions, timeouts and keep-alive fallback.
+     *
+     * @return a newly built client
+     * @throws TikaConfigException if the SSL context cannot be created or the
+     *                             credentials cannot be prepared
+     */
+    public HttpClient build() throws TikaConfigException {
+        LOG.info("http client does not verify ssl at this point. " +
+                "If you need that, please open a ticket.");
+        TrustStrategy acceptingTrustStrategy = (cert, authType) -> true;
+        SSLContext sslContext;
+        try {
+            sslContext = SSLContexts.custom()
+                    .loadTrustMaterial(null, acceptingTrustStrategy).build();
+        } catch (NoSuchAlgorithmException | KeyManagementException | KeyStoreException e) {
+            throw new TikaConfigException("failed to create the SSL context for the http client", e);
+        }
+        SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext,
+                NoopHostnameVerifier.INSTANCE);
+
+        Registry<ConnectionSocketFactory> socketFactoryRegistry =
+                RegistryBuilder.<ConnectionSocketFactory>create()
+                        .register("https", sslsf)
+                        .register("http", new PlainConnectionSocketFactory())
+                        .build();
+
+        PoolingHttpClientConnectionManager manager =
+                new PoolingHttpClientConnectionManager(socketFactoryRegistry);
+        manager.setDefaultMaxPerRoute(maxConnectionsPerRoute);
+        manager.setMaxTotal(maxConnections);
+
+        // Three distinct timeouts: waiting for a pooled connection, establishing
+        // the connection, and waiting for data on the socket. The connect timeout
+        // was previously written into the connection-request slot, overwriting
+        // requestTimeout and leaving the connect timeout unset.
+        RequestConfig requestConfig = RequestConfig.custom()
+                .setTargetPreferredAuthSchemes(Arrays.asList(AuthSchemes.BASIC,
+                        AuthSchemes.NTLM))
+                .setConnectionRequestTimeout(requestTimeout)
+                .setConnectTimeout(connectTimeout)
+                .setSocketTimeout(socketTimeout)
+                .build();
+
+        HttpClientBuilder builder = HttpClients.custom();
+        addCredentialsProvider(builder);
+        addProxy(builder);
+        return builder.setConnectionManager(manager)
+                .setRedirectStrategy(
+                        new CustomRedirectStrategy(allowedHostsForRedirect))
+                .setDefaultRequestConfig(requestConfig)
+                .setKeepAliveStrategy(getKeepAliveStrategy())
+                .setSSLSocketFactory(sslsf)
+                .setSSLHostnameVerifier(NoopHostnameVerifier.INSTANCE)
+                .build();
+    }
+
+    /**
+     * Installs a proxy route planner on the builder when a proxy host is
+     * configured; leaves the builder untouched when the host is blank.
+     *
+     * @param builder client builder being configured
+     */
+    private void addProxy(HttpClientBuilder builder) {
+        if (StringUtils.isBlank(proxyHost)) {
+            return;
+        }
+        HttpHost proxyAddress = new HttpHost(proxyHost, proxyPort);
+        builder.setRoutePlanner(new DefaultProxyRoutePlanner(proxyAddress));
+    }
+
+    /**
+     * Registers the configured credentials and the matching auth scheme on the
+     * builder. Does nothing when neither a user name nor a password is set.
+     *
+     * @param builder client builder being configured
+     * @throws IllegalArgumentException if only one of user name and password is
+     *                                  set, if the auth scheme is not "basic" or
+     *                                  "ntlm", or if "ntlm" is chosen without an
+     *                                  NT domain
+     * @throws TikaConfigException      if the credentials cannot be decrypted
+     */
+    private void addCredentialsProvider(HttpClientBuilder builder) throws TikaConfigException {
+
+        boolean userNameMissing = StringUtils.isBlank(userName);
+        boolean passwordMissing = StringUtils.isBlank(password);
+        if (userNameMissing && passwordMissing) {
+            return;
+        }
+
+        // Exactly one of the two is set. The previous form of this check repeated
+        // the both-blank test above and therefore could never fire.
+        if (userNameMissing || passwordMissing) {
+            throw new IllegalArgumentException("can't have one of 'username', " +
+                    "'password' null and the other not");
+        }
+
+        // Constant-first comparison so a null authScheme is reported as
+        // unsupported instead of causing a NullPointerException.
+        boolean basic = "basic".equals(authScheme);
+        boolean ntlm = "ntlm".equals(authScheme);
+        if (!basic && !ntlm) {
+            throw new IllegalArgumentException(
+                    "authScheme must be 'basic' or 'ntlm', but was: " + authScheme);
+        }
+
+        String finalUserName = decrypt(userName);
+        String finalPassword = decrypt(password);
+        String finalDomain = decrypt(ntDomain);
+        Credentials credentials;
+        Registry<AuthSchemeProvider> authSchemeRegistry;
+        if (basic) {
+            credentials = new UsernamePasswordCredentials(finalUserName, finalPassword);
+            authSchemeRegistry = RegistryBuilder
+                    .<AuthSchemeProvider>create()
+                    .register("basic", new BasicSchemeFactory())
+                    .build();
+        } else {
+            if (StringUtils.isBlank(ntDomain)) {
+                throw new IllegalArgumentException("must specify 'ntDomain'");
+            }
+            credentials = new NTCredentials(finalUserName, finalPassword, null, finalDomain);
+            authSchemeRegistry = RegistryBuilder.<AuthSchemeProvider>create()
+                    .register("ntlm", new NTLMSchemeFactory()).build();
+        }
+        CredentialsProvider provider = new BasicCredentialsProvider();
+        provider.setCredentials(AuthScope.ANY, credentials);
+        builder.setDefaultCredentialsProvider(provider);
+        builder.setDefaultAuthSchemeRegistry(authSchemeRegistry);
+
+    }
+
+    /**
+     * Returns the usable form of a configured credential value.
+     *
+     * @param encrypted configured value, possibly null
+     * @return the value unchanged when credentials are not flagged as AES
+     *         encrypted or when it is null; otherwise the decrypted text
+     * @throws TikaConfigException if the AES helper cannot be created or the
+     *                             value cannot be decrypted
+     */
+    private String decrypt(String encrypted) throws TikaConfigException {
+        if (!credentialsAESEncrypted || encrypted == null) {
+            return encrypted;
+        }
+        // the helper is created on first use only
+        if (aes == null) {
+            aes = new AES();
+        }
+        return aes.decrypt(encrypted);
+    }
+
+    /**
+     * Creates the keep-alive strategy used by the client. The strategy honors the
+     * first parseable {@code timeout} element of the response's Keep-Alive
+     * header (seconds, converted to milliseconds) and otherwise falls back to
+     * {@code keepAliveOnBadKeepAliveValueMs}.
+     *
+     * @return a strategy bound to this factory's fallback value
+     */
+    public ConnectionKeepAliveStrategy getKeepAliveStrategy() {
+        return (response, context) -> {
+            HeaderElementIterator elements = new BasicHeaderElementIterator(
+                    response.headerIterator(HTTP.CONN_KEEP_ALIVE));
+            while (elements.hasNext()) {
+                HeaderElement element = elements.nextElement();
+                String name = element.getName();
+                String timeout = element.getValue();
+                if (timeout == null || name == null || !name.equalsIgnoreCase("timeout")) {
+                    continue;
+                }
+                try {
+                    return Long.parseLong(timeout) * 1000;
+                } catch (NumberFormatException ignore) {
+                    // not a number: keep looking, then use the fallback below
+                }
+            }
+            return keepAliveOnBadKeepAliveValueMs;
+        };
+    }
+
+ private static class CustomRedirectStrategy extends LaxRedirectStrategy {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(CustomRedirectStrategy.class);
+ private Set<String> allowedHosts;
+
+ public CustomRedirectStrategy(Set<String> allowedHosts) {
+ this.allowedHosts = allowedHosts;
+ }
+
+ @Override
+ protected URI createLocationURI(final String location) throws
ProtocolException {
+ String newLocation = location;
+ try {
+ new URI(newLocation);
+ } catch (final URISyntaxException ex) {
+ LOG.warn("Redirected URL: [ " + newLocation + " ] will be
encoded");
+ try {
+ newLocation = URLEncoder.encode(newLocation,
StandardCharsets.UTF_8.name());
+ } catch (UnsupportedEncodingException e) {
+ LOG.warn("Well, that didn't work out... :(");
+ }
+ }
+ return super.createLocationURI(newLocation);
+ }
+
+ @Override
+ public boolean isRedirected(HttpRequest request, HttpResponse
response, HttpContext context) throws ProtocolException {
+ boolean isRedirectedSuper = super.isRedirected(request, response,
context);
+ if (isRedirectedSuper) {
+ Header locationHeader = response.getFirstHeader("Location");
+ String location = locationHeader.getValue();
+ if (StringUtils.isBlank(location)) {
+ return false;
+ }
+ URI uri;
+ try {
+ uri = new URI(location);
+ } catch (URISyntaxException e) {
+ return true;
+ }
+ if (!allowedHosts.isEmpty() &&
!allowedHosts.contains(uri.getHost())) {
+ LOG.info("Not allowing external redirect. OriginalUrl={},"
+
+ " RedirectLocation={}",
request.getRequestLine().getUri(), location);
+ return false;
+ }
+ }
+ return isRedirectedSuper;
+ }
+ }
+
+ private class AES {
+ private final SecretKeySpec secretKey;
+ private byte[] key;
+
+ private AES() throws TikaConfigException {
+ secretKey = setKey(System.getenv("AES_KEY"));
+ }
+
+ private SecretKeySpec setKey(String myKey) throws TikaConfigException {
+ MessageDigest sha = null;
+ try {
+ key = myKey.getBytes(StandardCharsets.UTF_8);
+ sha = MessageDigest.getInstance("SHA-1");
+ key = sha.digest(key);
+ key = Arrays.copyOf(key, 16);
+ return new SecretKeySpec(key, "AES");
+ } catch (NoSuchAlgorithmException e) {
+ throw new TikaConfigException("bad key", e);
+ }
+ }
+
+ public String encrypt(String strToEncrypt) throws TikaConfigException {
+ try {
+ Cipher cipher = Cipher.getInstance("AES/ECB/PKCS5Padding");
+ cipher.init(Cipher.ENCRYPT_MODE, secretKey);
+ return Base64.getEncoder()
+
.encodeToString(cipher.doFinal(strToEncrypt.getBytes(StandardCharsets.UTF_8)));
+ } catch
(NoSuchAlgorithmException|InvalidKeyException|NoSuchPaddingException|BadPaddingException|IllegalBlockSizeException
e) {
+ throw new TikaConfigException("bad encryption info", e);
+ }
+ }
+
+ public String decrypt(String strToDecrypt) throws TikaConfigException {
+ try {
+ Cipher cipher = Cipher.getInstance("AES/ECB/PKCS5PADDING");
+ cipher.init(Cipher.DECRYPT_MODE, secretKey);
+ return new
String(cipher.doFinal(Base64.getDecoder().decode(strToDecrypt)),
+ StandardCharsets.UTF_8);
+ } catch (NoSuchAlgorithmException|
+ InvalidKeyException|
+ NoSuchPaddingException|
+ BadPaddingException|
+ IllegalBlockSizeException e) {
+ throw new TikaConfigException("bad encryption info", e);
+ }
+ }
+ }
+}
diff --git
a/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientUtil.java
b/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientUtil.java
index 2e6fef7..780179c 100644
---
a/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientUtil.java
+++
b/tika-pipes/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientUtil.java
@@ -19,9 +19,7 @@ package org.apache.tika.client;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
-import org.apache.http.entity.BasicHttpEntity;
import org.apache.http.entity.ByteArrayEntity;
-import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
@@ -29,15 +27,13 @@ import java.nio.charset.StandardCharsets;
public class HttpClientUtil {
- private static HttpClient CLIENT = HttpClients.createDefault();
-
- public static boolean postJson(String url, String json) throws IOException,
+ public static boolean postJson(HttpClient client, String url, String json)
throws IOException,
TikaClientException {
HttpPost post = new HttpPost(url);
ByteArrayEntity entity = new
ByteArrayEntity(json.getBytes(StandardCharsets.UTF_8));
post.setEntity(entity);
post.setHeader("Content-Type", "application/json");
- HttpResponse response = CLIENT.execute(post);
+ HttpResponse response = client.execute(post);
if (response.getStatusLine().getStatusCode() != 200) {
@@ -51,4 +47,5 @@ public class HttpClientUtil {
}
return true;
}
+
}
diff --git a/tika-server/tika-server-client/pom.xml
b/tika-server/tika-server-client/pom.xml
index 23e304f..d3daf68 100644
--- a/tika-server/tika-server-client/pom.xml
+++ b/tika-server/tika-server-client/pom.xml
@@ -58,4 +58,72 @@
</dependency>
</dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+
<Automatic-Module-Name>org.apache.tika.server.client</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>${maven.shade.version}</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <createDependencyReducedPom>
+ false
+ </createDependencyReducedPom>
+ <!-- <filters> -->
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*</exclude>
+ <exclude>LICENSE.txt</exclude>
+ <exclude>NOTICE.txt</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ <transformers>
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+
<mainClass>org.apache.tika.server.client.TikaClientCLI</mainClass>
+ </transformer>
+
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/LICENSE</resource>
+
<file>target/classes/META-INF/LICENSE</file>
+ </transformer>
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/NOTICE</resource>
+ <file>target/classes/META-INF/NOTICE</file>
+ </transformer>
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ <resource>META-INF/DEPENDENCIES</resource>
+
<file>target/classes/META-INF/DEPENDENCIES</file>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
\ No newline at end of file
diff --git
a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfig.java
b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfig.java
deleted file mode 100644
index 7034c89..0000000
---
a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfig.java
+++ /dev/null
@@ -1,85 +0,0 @@
-package org.apache.tika.server.client;
-
-import org.apache.tika.config.Param;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeTypes;
-import org.apache.tika.pipes.fetchiterator.EmptyFetchIterator;
-import org.apache.tika.pipes.fetchiterator.FetchIterator;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.xml.sax.SAXException;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.reflect.InvocationTargetException;
-import java.net.URL;
-import java.nio.file.Path;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-public class TikaClientConfig extends TikaConfig {
- public TikaClientConfig(String file) throws TikaException, IOException,
SAXException {
- super(file);
- }
-
- public TikaClientConfig(Path path) throws TikaException, IOException,
SAXException {
- super(path);
- }
-
- public TikaClientConfig(Path path, ServiceLoader loader) throws
TikaException, IOException, SAXException {
- super(path, loader);
- }
-
- public TikaClientConfig(File file) throws TikaException, IOException,
SAXException {
- super(file);
- }
-
- public TikaClientConfig(File file, ServiceLoader loader) throws
TikaException, IOException, SAXException {
- super(file, loader);
- }
-
- public TikaClientConfig(URL url) throws TikaException, IOException,
SAXException {
- super(url);
- }
-
- public TikaClientConfig(URL url, ClassLoader loader) throws TikaException,
IOException, SAXException {
- super(url, loader);
- }
-
- public TikaClientConfig(URL url, ServiceLoader loader) throws
TikaException, IOException, SAXException {
- super(url, loader);
- }
-
- public TikaClientConfig(InputStream stream) throws TikaException,
IOException, SAXException {
- super(stream);
- }
-
- public TikaClientConfig(Document document) throws TikaException,
IOException {
- super(document);
- }
-
- public TikaClientConfig(Document document, ServiceLoader loader) throws
TikaException, IOException {
- super(document, loader);
- }
-
- public TikaClientConfig(Element element) throws TikaException, IOException
{
- super(element);
- }
-
- public TikaClientConfig(Element element, ClassLoader loader) throws
TikaException, IOException {
- super(element, loader);
- }
-
- public TikaClientConfig(ClassLoader loader) throws MimeTypeException,
IOException {
- super(loader);
- }
-
- public TikaClientConfig() throws TikaException, IOException {
- }
-
-}
diff --git
a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfigException.java
b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfigException.java
index 2144619..97776b8 100644
---
a/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfigException.java
+++
b/tika-server/tika-server-client/src/main/java/org/apache/tika/server/client/TikaClientConfigException.java
@@ -19,6 +19,7 @@ package org.apache.tika.server.client;
import org.apache.tika.exception.TikaException;
public class TikaClientConfigException extends TikaException {
+
public TikaClientConfigException(String msg) {
super(msg);
}