This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3226
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-3226 by this push:
new 86fc847 TIKA-3226 -- WIP do not merge
86fc847 is described below
commit 86fc84756dc58e683179c0b64cdb395b82c2f5dd
Author: tballison <[email protected]>
AuthorDate: Tue Jan 19 17:03:53 2021 -0500
TIKA-3226 -- WIP do not merge
---
pom.xml | 4 +-
.../java/org/apache/tika/config/TikaConfig.java | 114 +++++++++++++-
.../DefaultEmitter.java} | 44 +++---
.../main/java/org/apache/tika/emitter/Emitter.java | 5 +-
.../org/apache/tika/fetcher/DefaultFetcher.java | 13 +-
.../org/apache/tika/fetcher/SimpleUrlFetcher.java | 61 ++++++++
.../metadata/filter/FieldNameMappingFilter.java | 82 ++++++++++
tika-emitters/pom.xml | 2 +
.../apache/tika/emitter/fs/FileSystemEmitter.java | 9 +-
tika-emitters/tika-emitter-solr/pom.xml | 14 ++
.../org/apache/tika/emitter/solr/SolrEmitter.java | 166 ++++++++++++++++++---
.../org/apache/tika/emitter/solr/TestBasic.java | 43 ++++++
.../src/test/resources/log4j.properties | 24 +++
.../test/resources/tika-config-simple-emitter.xml | 54 +++++++
.../pom.xml | 12 +-
.../org/apache/tika/client/HttpClientUtil.java | 38 +++++
.../apache/tika/client/TikaClientException.java | 14 ++
.../apache/tika/server/classic/FetcherTest.java | 101 +++++++++++++
.../resources/config/tika-config-url-fetcher.xml | 32 ++--
.../server/core/DefaultInputStreamFactory.java | 5 -
...treamFactory.java => FetcherStreamFactory.java} | 38 +++--
.../tika/server/core/InputStreamFactory.java | 1 -
.../org/apache/tika/server/core/TikaServerCli.java | 75 +++++++---
.../tika/server/core/resource/EmitterResource.java | 115 ++++++++++++++
.../core/resource/RecursiveMetadataResource.java | 25 +++-
.../org/apache/tika/server/core/CXFTestBase.java | 6 +-
26 files changed, 961 insertions(+), 136 deletions(-)
diff --git a/pom.xml b/pom.xml
index 0547122..a31f4fe 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="UTF-8"?>
+ <?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
@@ -37,6 +37,7 @@
<modules>
<module>tika-parent</module>
<module>tika-core</module>
+ <module>tika-emitters</module>
<module>tika-fetchers</module>
<module>tika-parsers</module>
<module>tika-bundles</module>
@@ -51,7 +52,6 @@
<module>tika-example</module>
<module>tika-java7</module>
<module>tika-eval</module>
- <module>tika-emitters</module>
</modules>
<profiles>
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 20742d8..be82195 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -46,6 +46,8 @@ import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.DefaultEncodingDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.emitter.DefaultEmitter;
+import org.apache.tika.emitter.Emitter;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.fetcher.DefaultFetcher;
@@ -69,6 +71,8 @@ import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.multiple.AbstractMultipleParser;
import org.apache.tika.utils.AnnotationUtils;
import org.apache.tika.utils.XMLReaderUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
@@ -115,7 +119,11 @@ public class TikaConfig {
}
private static Fetcher getDefaultFetcher(ServiceLoader loader) {
- return new DefaultFetcher(loader);
+ return new DefaultFetcher(Collections.EMPTY_LIST);
+ }
+
+ private static Emitter getDefaultEmitter(ServiceLoader loader) {
+ return new DefaultEmitter(Collections.EMPTY_LIST);
}
//use this to look for unneeded instantiations of TikaConfig
@@ -131,6 +139,7 @@ public class TikaConfig {
private final EncodingDetector encodingDetector;
private final MetadataFilter metadataFilter;
private final Fetcher fetcher;
+ private final Emitter emitter;
public TikaConfig(String file)
throws TikaException, IOException, SAXException {
@@ -198,6 +207,7 @@ public class TikaConfig {
EncodingDetectorXmlLoader encodingDetectorXmlLoader = new
EncodingDetectorXmlLoader();
MetadataFilterXmlLoader metadataFilterXmlLoader = new
MetadataFilterXmlLoader();
FetcherXmlLoader fetcherXmlLoader = new FetcherXmlLoader();
+ EmitterXmlLoader emitterXmlLoader = new EmitterXmlLoader();
updateXMLReaderUtils(element);
this.mimeTypes = typesFromDomElement(element);
this.detector = detectorLoader.loadOverall(element, mimeTypes, loader);
@@ -209,6 +219,7 @@ public class TikaConfig {
this.executorService = executorLoader.loadOverall(element, mimeTypes,
loader);
this.metadataFilter = metadataFilterXmlLoader.loadOverall(element,
mimeTypes, loader);
this.fetcher = fetcherXmlLoader.loadOverall(element, mimeTypes,
loader);
+ this.emitter = emitterXmlLoader.loadOverall(element, mimeTypes,
loader);
this.serviceLoader = loader;
TIMES_INSTANTIATED.incrementAndGet();
}
@@ -236,6 +247,7 @@ public class TikaConfig {
this.executorService = getDefaultExecutorService();
this.metadataFilter = getDefaultMetadataFilter(serviceLoader);
this.fetcher = getDefaultFetcher(serviceLoader);
+ this.emitter = getDefaultEmitter(serviceLoader);
TIMES_INSTANTIATED.incrementAndGet();
}
@@ -273,6 +285,7 @@ public class TikaConfig {
this.executorService = getDefaultExecutorService();
this.metadataFilter = getDefaultMetadataFilter(serviceLoader);
this.fetcher = getDefaultFetcher(serviceLoader);
+ this.emitter = getDefaultEmitter(serviceLoader);
} else {
ServiceLoader tmpServiceLoader = new ServiceLoader();
try (InputStream stream = getConfigInputStream(config,
tmpServiceLoader)) {
@@ -285,6 +298,7 @@ public class TikaConfig {
ExecutorServiceXmlLoader executorLoader = new
ExecutorServiceXmlLoader();
MetadataFilterXmlLoader metadataFilterXmlLoader = new
MetadataFilterXmlLoader();
FetcherXmlLoader fetcherXmlLoader = new FetcherXmlLoader();
+ EmitterXmlLoader emitterXmlLoader = new EmitterXmlLoader();
this.mimeTypes = typesFromDomElement(element);
this.encodingDetector =
encodingDetectorLoader.loadOverall(element, mimeTypes, serviceLoader);
@@ -297,6 +311,7 @@ public class TikaConfig {
this.executorService = executorLoader.loadOverall(element,
mimeTypes, serviceLoader);
this.metadataFilter =
metadataFilterXmlLoader.loadOverall(element, mimeTypes, serviceLoader);
this.fetcher = fetcherXmlLoader.loadOverall(element,
mimeTypes, serviceLoader);
+ this.emitter = emitterXmlLoader.loadOverall(element,
mimeTypes, serviceLoader);
} catch (SAXException e) {
throw new TikaException(
"Specified Tika configuration has syntax errors: "
@@ -429,6 +444,10 @@ public class TikaConfig {
return fetcher;
}
+ public Emitter getEmitter() {
+ return emitter;
+ }
+
/**
* Provides a default configuration (TikaConfig). Currently creates a
* new instance each time it's called; we may be able to have it
@@ -1362,7 +1381,6 @@ public class TikaConfig {
me.printStackTrace();
}
}
-
return fetcher;
}
@@ -1371,4 +1389,96 @@ public class TikaConfig {
return created; // No decoration of MetadataFilters
}
}
+
+    /**
+     * Loads the "emitters"/"emitter" elements of a tika-config into an {@link Emitter}.
+     */
+    private static class EmitterXmlLoader extends XmlLoader<Emitter, Emitter> {
+
+        boolean supportsComposite() {
+            return true;
+        }
+
+        String getParentTagName() {
+            return "emitters";
+        }
+
+        String getLoaderTagName() {
+            return "emitter";
+        }
+
+        @Override
+        Class<? extends Emitter> getLoaderClass() {
+            return Emitter.class;
+        }
+
+        // DefaultEmitter is the composite: it dispatches to child emitters by name
+        @Override
+        boolean isComposite(Emitter loaded) {
+            return loaded instanceof DefaultEmitter;
+        }
+
+        @Override
+        boolean isComposite(Class<? extends Emitter> loadedClass) {
+            return DefaultEmitter.class.isAssignableFrom(loadedClass);
+        }
+
+        @Override
+        Emitter preLoadOne(Class<? extends Emitter> loadedClass,
+                                  String classname, MimeTypes mimeTypes) throws TikaException {
+            // No emitter classes are barred from config,
+            // so continue with normal loading
+            return null;
+        }
+
+        @Override
+        Emitter createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
+            return getDefaultEmitter(loader);
+        }
+
+        private Emitter getDefaultEmitter(ServiceLoader loader) {
+            //TODO: should we allow service loading?
+            return new DefaultEmitter(Collections.<Emitter>emptyList());
+        }
+
+        //this ignores the service loader
+        @Override
+        Emitter createComposite(List<Emitter> loaded, MimeTypes mimeTypes, ServiceLoader loader) {
+            return new DefaultEmitter(loaded);
+        }
+
+        /**
+         * Instantiates a composite emitter reflectively: first via a
+         * (ServiceLoader, Collection) constructor, then via a (List) constructor.
+         * params and mimeTypes are not used.
+         *
+         * @return the new emitter, or null if neither constructor exists
+         */
+        @Override
+        Emitter createComposite(Class<? extends Emitter> emitterClass,
+                                       List<Emitter> childEmitters,
+                                       Set<Class<? extends Emitter>> excludeFilters,
+                                       Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
+                throws InvocationTargetException, IllegalAccessException,
+                InstantiationException {
+            try {
+                return emitterClass.getConstructor(ServiceLoader.class, Collection.class)
+                        .newInstance(loader, excludeFilters);
+            } catch (NoSuchMethodException ignored) {
+                // expected for emitters that only offer the (List) constructor
+            }
+            try {
+                return emitterClass.getConstructor(List.class).newInstance(childEmitters);
+            } catch (NoSuchMethodException me) {
+                // no usable constructor: report it and fall through to null as before
+                me.printStackTrace();
+            }
+            return null;
+        }
+
+        @Override
+        Emitter decorate(Emitter created, Element element) {
+            return created; // No decoration of emitters yet
+        }
+    }
}
diff --git
a/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java
b/tika-core/src/main/java/org/apache/tika/emitter/DefaultEmitter.java
similarity index 58%
copy from tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java
copy to tika-core/src/main/java/org/apache/tika/emitter/DefaultEmitter.java
index 77e77d7..e221505 100644
--- a/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/emitter/DefaultEmitter.java
@@ -14,10 +14,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.fetcher;
+package org.apache.tika.emitter;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.fetcher.FetchPrefixKeyPair;
+import org.apache.tika.fetcher.Fetcher;
import org.apache.tika.metadata.Metadata;
import java.io.IOException;
@@ -33,48 +35,48 @@ import java.util.concurrent.ConcurrentHashMap;
*
* This does not allow multiple fetchers supporting the same prefix.
*/
-public class DefaultFetcher implements Fetcher {
+public class DefaultEmitter implements Emitter {
- private final Map<String, Fetcher> fetcherMap = new ConcurrentHashMap<>();
+ private final Map<String, Emitter> emitterMap = new ConcurrentHashMap<>();
- private static List<Fetcher> getDefaultFilters(
+ private static List<Emitter> getDefaultFilters(
ServiceLoader loader) {
- return loader.loadStaticServiceProviders(Fetcher.class);
+ return loader.loadStaticServiceProviders(Emitter.class);
}
- public DefaultFetcher(ServiceLoader serviceLoader) {
+ public DefaultEmitter(ServiceLoader serviceLoader) {
this(getDefaultFilters(serviceLoader));
}
- public DefaultFetcher(List<Fetcher> fetchers) {
- for (Fetcher fetcher : fetchers) {
- for (String supportedPrefix : fetcher.getSupportedPrefixes()) {
- if (fetcherMap.containsKey(supportedPrefix)) {
+ public DefaultEmitter(List<Emitter> emitters) {
+ for (Emitter emitter : emitters) {
+ for (String name : emitter.getSupported()) {
+ if (emitterMap.containsKey(name)) {
throw new IllegalArgumentException(
- "Multiple fetchers cannot support the same prefix:
"
- + supportedPrefix);
+ "Multiple emitters cannot support the same name: "
+ + name);
}
- fetcherMap.put(supportedPrefix, fetcher);
+ emitterMap.put(name, emitter);
}
}
}
@Override
- public Set<String> getSupportedPrefixes() {
- return fetcherMap.keySet();
+ public Set<String> getSupported() {
+ return emitterMap.keySet();
}
+
@Override
- public InputStream fetch(String fetchString, Metadata metadata)
+ public void emit(String emitterName, List<Metadata> metadata)
throws IOException, TikaException {
- FetchPrefixKeyPair fetchPrefixKeyPair =
FetchPrefixKeyPair.create(fetchString);
- Fetcher fetcher = fetcherMap.get(fetchPrefixKeyPair.getPrefix());
- if (fetcher == null) {
+ Emitter emitter = emitterMap.get(emitterName);
+ if (emitter == null) {
throw new IllegalArgumentException("Can't find fetcher for prefix:
"+
- fetchPrefixKeyPair.getPrefix());
+ emitterName);
}
- return fetcher.fetch(fetchString, metadata);
+ emitter.emit(emitterName, metadata);
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/emitter/Emitter.java
b/tika-core/src/main/java/org/apache/tika/emitter/Emitter.java
index aa8ee55..23021e3 100644
--- a/tika-core/src/main/java/org/apache/tika/emitter/Emitter.java
+++ b/tika-core/src/main/java/org/apache/tika/emitter/Emitter.java
@@ -5,14 +5,15 @@ import org.apache.tika.metadata.Metadata;
import java.io.IOException;
import java.util.List;
+import java.util.Set;
public interface Emitter {
- String getName();
+ Set<String> getSupported();
//TODO: do we need a key or can we pass that in metadatalist?
//If we do need it, how do we populate it?
- void emit(List<Metadata> metadataList) throws IOException, TikaException;
+ void emit(String emitterName, List<Metadata> metadataList) throws
IOException, TikaException;
//TODO we can add this later?
//void emit(String txt, Metadata metadata) throws IOException,
TikaException;
diff --git
a/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java
b/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java
index 77e77d7..ac539c0 100644
--- a/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/DefaultFetcher.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika.fetcher;
-import org.apache.tika.config.ServiceLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -31,22 +30,12 @@ import java.util.concurrent.ConcurrentHashMap;
* Utility class that will apply the appropriate fetcher
* to the fetchString based on the prefix.
*
- * This does not allow multiple fetchers supporting the same prefix.
+ * This forbids multiple fetchers supporting the same prefix.
*/
public class DefaultFetcher implements Fetcher {
private final Map<String, Fetcher> fetcherMap = new ConcurrentHashMap<>();
- private static List<Fetcher> getDefaultFilters(
- ServiceLoader loader) {
- return loader.loadStaticServiceProviders(Fetcher.class);
- }
-
-
- public DefaultFetcher(ServiceLoader serviceLoader) {
- this(getDefaultFilters(serviceLoader));
- }
-
public DefaultFetcher(List<Fetcher> fetchers) {
for (Fetcher fetcher : fetchers) {
for (String supportedPrefix : fetcher.getSupportedPrefixes()) {
diff --git
a/tika-core/src/main/java/org/apache/tika/fetcher/SimpleUrlFetcher.java
b/tika-core/src/main/java/org/apache/tika/fetcher/SimpleUrlFetcher.java
new file mode 100644
index 0000000..a68d004
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/SimpleUrlFetcher.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.fetcher;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * This is a lightweight fetcher that uses Java's
+ * {@link URL#openStream()}. Please consider a more
+ * robust way to fetch URLs, e.g. Apache httpcomponents,
+ * curl or wget...
+ *
+ * This is limited to http:, https: and ftp: urls. This does
+ * not support the file:/// protocol. See {@link FileSystemFetcher}.
+ */
+public class SimpleUrlFetcher implements Fetcher {
+    private static final String PREFIX = "url";
+    private static final Set<String> SUPPORTED = Collections.singleton(PREFIX);
+
+    /** Returns the single prefix ("url") handled by this fetcher. */
+    @Override
+    public Set<String> getSupportedPrefixes() {
+        return SUPPORTED;
+    }
+
+    /** Opens the URL found in the key part of fetchString; other protocols are rejected. */
+    @Override
+    public InputStream fetch(String fetchString, Metadata metadata)
+            throws IOException, TikaException {
+        URL url = new URL(FetchPrefixKeyPair.create(fetchString).getKey());
+        String protocol = url.getProtocol();
+        if (! "http".equals(protocol) && ! "https".equals(protocol) &&
+                ! "ftp".equals(protocol)) {
+            throw new TikaException("This fetcher only handles: http, https, ftp; NOT: "
+                    + protocol);
+        }
+        return TikaInputStream.get(url, metadata);
+    }
+}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java
new file mode 100644
index 0000000..c64d784
--- /dev/null
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java
@@ -0,0 +1,82 @@
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class FieldNameMappingFilter implements MetadataFilter {
+    private static final String MAPPING_OPERATOR = "->";
+
+    Map<String, String> mapping = new HashMap<>();
+
+    boolean excludeUnmapped = true;
+
+    /**
+     * Renames mapped fields in place; drops unmapped fields when excludeUnmapped is true.
+     */
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        // stage renamed values separately so a field that was just written
+        // cannot be removed or renamed again later in the same pass
+        Metadata renamed = new Metadata();
+        for (String n : metadata.names()) {
+            String to = mapping.get(n);
+            if (to != null) {
+                for (String val : metadata.getValues(n)) {
+                    renamed.add(to, val);
+                }
+                metadata.remove(n);
+            } else if (excludeUnmapped) {
+                metadata.remove(n);
+            }
+        }
+        for (String n : renamed.names()) {
+            for (String val : renamed.getValues(n)) {
+                metadata.add(n, val);
+            }
+        }
+    }
+
+    /**
+     * If this is <code>true</code> (default), this means that only the fields that
+     * have a "from" value in the mapper will be passed through. Otherwise,
+     * this will pass through all keys/values and mutate the keys
+     * that exist in the mappings.
+     * @param excludeUnmapped whether to drop fields that have no mapping
+     */
+    @Field
+    public void setExcludeUnmapped(boolean excludeUnmapped) {
+        this.excludeUnmapped = excludeUnmapped;
+    }
+
+    /** Parses entries of the form "from->to" into the mapping; rejects malformed ones. */
+    @Field
+    public void setMappings(List<String> mappings) {
+        for (String m : mappings) {
+            // limit -1 keeps trailing empty strings so "a->" and "a->b->" are diagnosed properly
+            String[] args = m.split(MAPPING_OPERATOR, -1);
+            if (args.length < 2) {
+                throw new IllegalArgumentException(
+                        "Can't find mapping operator '->' in: " + m);
+            } else if (args.length > 2) {
+                throw new IllegalArgumentException(
+                        "Must have only one mapping operator. I found more than one: " + m);
+            }
+            String from = args[0].trim();
+            if (from.length() == 0) {
+                throw new IllegalArgumentException("Must contain content before the "+
+                        "mapping operator '->'");
+            }
+            String to = args[1].trim();
+            if (to.length() == 0) {
+                throw new IllegalArgumentException("Must contain content after the "+
+                        "mapping operator '->'");
+            }
+            mapping.put(from, to);
+        }
+    }
+}
diff --git a/tika-emitters/pom.xml b/tika-emitters/pom.xml
index 21eed2b..42bdf6b 100644
--- a/tika-emitters/pom.xml
+++ b/tika-emitters/pom.xml
@@ -35,7 +35,9 @@
<modules>
<module>tika-emitter-fs</module>
+ <module>tika-httpclient-commons</module>
<module>tika-emitter-solr</module>
+
</modules>
diff --git
a/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
b/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
index 5c5016b..a99c013 100644
---
a/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
+++
b/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
@@ -14,7 +14,9 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
+import java.util.Collections;
import java.util.List;
+import java.util.Set;
public class FileSystemEmitter implements Emitter {
@@ -22,14 +24,13 @@ public class FileSystemEmitter implements Emitter {
private Path basePath = null;
private String fileExtension = "json";
-
@Override
- public String getName() {
- return name;
+ public Set<String> getSupported() {
+ return Collections.singleton(name);
}
@Override
- public void emit(List<Metadata> metadataList) throws IOException,
TikaException {
+ public void emit(String emitterName, List<Metadata> metadataList) throws
IOException, TikaException {
Path output;
if (metadataList == null || metadataList.size() == 0) {
throw new TikaEmitterException("metadata list must not be null or
of size 0");
diff --git a/tika-emitters/tika-emitter-solr/pom.xml
b/tika-emitters/tika-emitter-solr/pom.xml
index 8ee76af..6aefb89 100644
--- a/tika-emitters/tika-emitter-solr/pom.xml
+++ b/tika-emitters/tika-emitter-solr/pom.xml
@@ -36,6 +36,20 @@
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-httpclient-commons</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
</dependencies>
</project>
\ No newline at end of file
diff --git
a/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/emitter/solr/SolrEmitter.java
b/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/emitter/solr/SolrEmitter.java
index 9f73cc8..e77519e 100644
---
a/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/emitter/solr/SolrEmitter.java
+++
b/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/emitter/solr/SolrEmitter.java
@@ -1,5 +1,9 @@
package org.apache.tika.emitter.solr;
+import com.google.gson.Gson;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonObject;
+import org.apache.tika.client.HttpClientUtil;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -8,41 +12,135 @@ import org.apache.tika.emitter.Emitter;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.IOException;
+import java.util.Collections;
import java.util.List;
import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
public class SolrEmitter implements Emitter, Initializable {
+ enum AttachmentStrategy {
+ SKIP,
+ CONCATENATE_CONTENT,
+ PARENT_CHILD,
+ //anything else?
+ }
+ private static final Gson GSON = new Gson();
+ private static final String ATTACHMENTS = "attachments";
+ private static final String UPDATE_PATH = "/update";
+ private static final Logger LOG =
LoggerFactory.getLogger(SolrEmitter.class);
+
private String name = "solr";
- boolean collapseEmbeddedFiles = false;
+ private AttachmentStrategy attachmentStrategy =
AttachmentStrategy.PARENT_CHILD;
private String url;
+ private String contentField = "content";
+ private String idField = "id";
+ private int commitWithin = 100;
@Override
- public String getName() {
- return name;
+ public void emit(String emitterName, List<Metadata> metadataList) throws
IOException,
+ TikaException {
+ if (metadataList == null || metadataList.size() == 0) {
+ LOG.warn("metadataList is null or empty");
+ return;
+ }
+ String json = jsonify(metadataList);
+ LOG.debug("emitting json:"+json);
+
HttpClientUtil.postJson(url+UPDATE_PATH+"?commitWithin="+getCommitWithin(),
json);
}
- @Override
- public void emit(List<Metadata> metadataList) throws IOException,
- TikaException {
+ private String jsonify(List<Metadata> metadataList) {
+ if (attachmentStrategy == AttachmentStrategy.SKIP) {
+ return toJsonString(jsonify(metadataList.get(0)));
+ } else if (attachmentStrategy ==
AttachmentStrategy.CONCATENATE_CONTENT) {
+ //this only handles text for now, not xhtml
+ StringBuilder sb = new StringBuilder();
+ for (Metadata metadata : metadataList) {
+ String content = metadata.get(getContentField());
+ if (content != null) {
+ sb.append(content).append("\n");
+ }
+ }
+ Metadata parent = metadataList.get(0);
+ parent.set(getContentField(), sb.toString());
+ return toJsonString(jsonify(parent));
+ } else if (attachmentStrategy == AttachmentStrategy.PARENT_CHILD) {
+ if (metadataList.size() == 1) {
+ JsonObject obj = jsonify(metadataList.get(0));
+ return toJsonString(obj);
+ }
+ JsonObject parent = jsonify(metadataList.get(0));
+ JsonArray children = new JsonArray();
+ for (int i = 1; i < metadataList.size(); i++) {
+ Metadata m = metadataList.get(i);
+ m.set(idField, UUID.randomUUID().toString());
+ children.add(jsonify(m));
+ }
+ parent.add(ATTACHMENTS, children);
+ return toJsonString(parent);
+ } else {
+ throw new IllegalArgumentException("I don't yet support this
attachment strategy: "
+ + attachmentStrategy);
+ }
+ }
+ private String toJsonString(JsonObject obj) {
+ //wrap the document into an array
+ //so that Solr correctly interprets this as
+ //upload docs vs a command.
+ JsonArray docs = new JsonArray();
+ docs.add(obj);
+ return GSON.toJson(docs);
+ }
+
+ private JsonObject jsonify(Metadata metadata) {
+ JsonObject obj = new JsonObject();
+ for (String n : metadata.names()) {
+ String[] vals = metadata.getValues(n);
+ if (vals.length == 0) {
+ continue;
+ } else if (vals.length == 1) {
+ obj.addProperty(n, vals[0]);
+ } else if (vals.length > 1) {
+ JsonArray valArr = new JsonArray();
+ for (int i = 0; i < vals.length; i++) {
+ valArr.add(vals[i]);
+ }
+ obj.add(n, valArr);
+ }
+ }
+ return obj;
}
/**
- * If set to true, this concatenates text from all embedded files
- * with the primary document's text but throws out the metadata
- * from the embedded files.
- *
- * If set to false (default), the SolrEmitter will emit attachments
- * as "children" of the parent.
+ * Options: "skip", "concatenate-content", "parent-child". Default is
"parent-child".
+ * If set to "skip", this will index only the main file and ignore all info
+ * in the attachments. If set to "concatenate", this will concatenate the
+ * content extracted from the attachments into the main document and
+ * then index the main document with the concatenated content _and_ the
+ * main document's metadata (metadata from attachments will be thrown
away).
+ * If set to "parent-child", this will index the attachments as children
+ * of the parent document via Solr's parent-child relationship.
*
- * @param collapseEmbeddedFiles
+ * @param attachmentStrategy
*/
@Field
- public void setCollapseEmbeddedFiles(boolean collapseEmbeddedFiles) {
- this.collapseEmbeddedFiles = collapseEmbeddedFiles;
+ public void setAttachmentStrategy(String attachmentStrategy) {
+ if (attachmentStrategy.equals("skip")) {
+ this.attachmentStrategy = AttachmentStrategy.SKIP;
+ } else if (attachmentStrategy.equals("concatenate-content")) {
+ this.attachmentStrategy = AttachmentStrategy.CONCATENATE_CONTENT;
+ } else if (attachmentStrategy.equals("parent-child")) {
+ this.attachmentStrategy = AttachmentStrategy.PARENT_CHILD;
+ } else {
+ throw new IllegalArgumentException("Expected 'skip',
'concatenate-content' or "+
+ "'parent-child'. I regret I do not recognize: " +
attachmentStrategy);
+ }
}
@Field
@@ -55,10 +153,38 @@ public class SolrEmitter implements Emitter, Initializable
{
* @param url
*/
@Field
- public void setSolrUrl(String url) {
+ public void setUrl(String url) {
+ if (url.endsWith("/")) {
+ url = url.substring(0, url.length()-1);
+ }
this.url = url;
}
+ /**
+ * This is the field _after_ metadata mappings have been applied
+ * that contains the "content" for each metadata object.
+ *
+ * This is the field that is used if {@link #attachmentStrategy}
+ * is {@link AttachmentStrategy#CONCATENATE_CONTENT}.
+ * @param contentField
+ */
+ @Field
+ public void setContentField(String contentField) {
+ this.contentField = contentField;
+ }
+
+ public String getContentField() {
+ return contentField;
+ }
+
+ @Field
+ public void setCommitWithin(int commitWithin) {
+ this.commitWithin = commitWithin;
+ }
+
+ public int getCommitWithin() {
+ return commitWithin;
+ }
//TODO: add username/password for authentication?
/**
@@ -69,7 +195,7 @@ public class SolrEmitter implements Emitter, Initializable {
*/
@Field
public void setIdField(String idField) {
-
+ this.idField = idField;
}
@Override
@@ -82,4 +208,10 @@ public class SolrEmitter implements Emitter, Initializable {
public void checkInitialization(InitializableProblemHandler
problemHandler) throws TikaConfigException {
}
+
+ @Override
+ public Set<String> getSupported() {
+ return Collections.singleton(name);
+ }
+
}
diff --git
a/tika-emitters/tika-emitter-solr/src/test/java/org/apache/tika/emitter/solr/TestBasic.java
b/tika-emitters/tika-emitter-solr/src/test/java/org/apache/tika/emitter/solr/TestBasic.java
new file mode 100644
index 0000000..b374476
--- /dev/null
+++
b/tika-emitters/tika-emitter-solr/src/test/java/org/apache/tika/emitter/solr/TestBasic.java
@@ -0,0 +1,43 @@
+package org.apache.tika.emitter.solr;
+
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.emitter.Emitter;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.junit.Test;
+
+import java.lang.reflect.Array;
+import java.util.ArrayList;
+import java.util.List;
+
+/** Smoke test: pushes a parent document and one attachment through the configured emitter. */
+public class TestBasic {
+    @Test
+    public void testBasic() throws Exception {
+        TikaConfig config = new TikaConfig(
+                TestBasic.class.getResourceAsStream("/tika-config-simple-emitter.xml"));
+        // parent document; CREATOR receives two values
+        Metadata parent = new Metadata();
+        parent.set("id", "1");
+        parent.set(Metadata.CONTENT_LENGTH, "314159");
+        parent.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, "the quick brown");
+        parent.set(TikaCoreProperties.TITLE, "this is the first title");
+        parent.add(TikaCoreProperties.CREATOR, "firstAuthor");
+        parent.add(TikaCoreProperties.CREATOR, "secondAuthor");
+
+        // embedded document
+        Metadata attachment = new Metadata();
+        attachment.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/path_to_this.txt");
+        attachment.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, "fox jumped over the lazy");
+        MetadataFilter filter = config.getMetadataFilter();
+        List<Metadata> docs = new ArrayList<>();
+        for (Metadata m : new Metadata[]{parent, attachment}) {
+            filter.filter(m);
+            docs.add(m);
+        }
+        config.getEmitter().emit("solr1", docs);
+    }
+}
diff --git
a/tika-emitters/tika-emitter-solr/src/test/resources/log4j.properties
b/tika-emitters/tika-emitter-solr/src/test/resources/log4j.properties
new file mode 100644
index 0000000..92b6d56
--- /dev/null
+++ b/tika-emitters/tika-emitter-solr/src/test/resources/log4j.properties
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#info,debug, error,fatal ...
+log4j.rootLogger=debug,stderr
+
+#console
+log4j.appender.stderr=org.apache.log4j.ConsoleAppender
+log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
+log4j.appender.stderr.Target=System.err
+
+log4j.appender.stderr.layout.ConversionPattern= %-5p %m%n
diff --git
a/tika-emitters/tika-emitter-solr/src/test/resources/tika-config-simple-emitter.xml
b/tika-emitters/tika-emitter-solr/src/test/resources/tika-config-simple-emitter.xml
new file mode 100644
index 0000000..18c65c6
--- /dev/null
+++
b/tika-emitters/tika-emitter-solr/src/test/resources/tika-config-simple-emitter.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter
class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
+ <params>
+ <param name="mappings" type="list">
+ <string>X-TIKA:content->content</string>
+
<string>X-TIKA:embedded_resource_path->embedded_path</string>
+ <string>Content-Length->length</string>
+ <string>dc:creator->creators</string>
+ <string>dc:title->title</string>
+ </param>
+ </params>
+ </metadataFilter>
+ </metadataFilters>
+ <emitters>
+ <emitter class="org.apache.tika.emitter.solr.SolrEmitter">
+ <params>
+ <param name="name" type="string">solr1</param>
+ <param name="url"
type="string">http://localhost:8983/solr/tika-test</param>
+ <param name="attachmentStrategy"
type="string">concatenate-content</param>
+ <param name="contentField" type="string">content</param>
+ <param name="commitWithin" type="int">10</param>
+ </params>
+ </emitter>
+ <emitter class="org.apache.tika.emitter.solr.SolrEmitter">
+ <params>
+ <param name="name" type="string">solr2</param>
+ <param name="url"
type="string">http://localhost:8983/solr/tika-test</param>
+ <param name="attachmentStrategy"
type="string">parent-child</param>
+ <param name="contentField" type="string">content</param>
+ <param name="commitWithin" type="int">10</param>
+ </params>
+ </emitter>
+ </emitters>
+</properties>
\ No newline at end of file
diff --git a/tika-emitters/tika-emitter-solr/pom.xml
b/tika-emitters/tika-httpclient-commons/pom.xml
similarity index 77%
copy from tika-emitters/tika-emitter-solr/pom.xml
copy to tika-emitters/tika-httpclient-commons/pom.xml
index 8ee76af..8a5fa89 100644
--- a/tika-emitters/tika-emitter-solr/pom.xml
+++ b/tika-emitters/tika-httpclient-commons/pom.xml
@@ -27,7 +27,7 @@
</parent>
<modelVersion>4.0.0</modelVersion>
- <artifactId>tika-emitter-solr</artifactId>
+ <artifactId>tika-httpclient-commons</artifactId>
<dependencies>
<dependency>
@@ -36,6 +36,16 @@
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient</artifactId>
+ <version>${httpcomponents.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.google.code.gson</groupId>
+ <artifactId>gson</artifactId>
+ <version>${gson.version}</version>
+ </dependency>
</dependencies>
</project>
\ No newline at end of file
diff --git
a/tika-emitters/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientUtil.java
b/tika-emitters/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientUtil.java
new file mode 100644
index 0000000..041e7af
--- /dev/null
+++
b/tika-emitters/tika-httpclient-commons/src/main/java/org/apache/tika/client/HttpClientUtil.java
@@ -0,0 +1,38 @@
+package org.apache.tika.client;
+
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.BasicHttpEntity;
+import org.apache.http.entity.ByteArrayEntity;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.util.EntityUtils;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+
+public class HttpClientUtil {
+
+ private static HttpClient CLIENT = HttpClients.createDefault();
+
+ public static boolean postJson(String url, String json) throws IOException,
+ TikaClientException {
+ HttpPost post = new HttpPost(url);
+ ByteArrayEntity entity = new
ByteArrayEntity(json.getBytes(StandardCharsets.UTF_8));
+ post.setEntity(entity);
+ post.setHeader("Content-Type", "application/json");
+ HttpResponse response = CLIENT.execute(post);
+
+
+ if (response.getStatusLine().getStatusCode() != 200) {
+ String msg = EntityUtils.toString(response.getEntity());
+ throw new TikaClientException("Bad status: " +
+ response.getStatusLine().getStatusCode() + " : "+
+ msg);
+ } else {
+ String msg = EntityUtils.toString(response.getEntity());
+ System.out.println("httputil: " + msg);
+ }
+ return true;
+ }
+}
diff --git
a/tika-emitters/tika-httpclient-commons/src/main/java/org/apache/tika/client/TikaClientException.java
b/tika-emitters/tika-httpclient-commons/src/main/java/org/apache/tika/client/TikaClientException.java
new file mode 100644
index 0000000..1827f3e
--- /dev/null
+++
b/tika-emitters/tika-httpclient-commons/src/main/java/org/apache/tika/client/TikaClientException.java
@@ -0,0 +1,14 @@
+package org.apache.tika.client;
+
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * A {@link TikaException} subclass in the {@code org.apache.tika.client}
+ * package. Both constructors pass their arguments straight to the superclass.
+ */
+public class TikaClientException extends TikaException {
+ /**
+ * @param msg description of the failure
+ */
+ public TikaClientException(String msg) {
+ super(msg);
+ }
+
+ /**
+ * @param msg description of the failure
+ * @param cause underlying exception
+ */
+ public TikaClientException(String msg, Throwable cause) {
+ super(msg, cause);
+ }
+}
diff --git
a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/FetcherTest.java
b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/FetcherTest.java
new file mode 100644
index 0000000..7ff931e
--- /dev/null
+++
b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/FetcherTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.classic;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.server.core.CXFTestBase;
+import org.apache.tika.server.core.DefaultInputStreamFactory;
+import org.apache.tika.server.core.FetcherStreamFactory;
+import org.apache.tika.server.core.InputStreamFactory;
+import org.apache.tika.server.core.resource.RecursiveMetadataResource;
+import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.writer.MetadataListMessageBodyWriter;
+import org.junit.Test;
+
+import javax.ws.rs.core.MultivaluedHashMap;
+import javax.ws.rs.core.MultivaluedMap;
+import javax.ws.rs.core.Response;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Exercises the /rmeta endpoint when the input is supplied through the
+ * "fetcherString" request header instead of the request body. The server
+ * is wired with a {@link FetcherStreamFactory} built from the fetcher in
+ * config/tika-config-url-fetcher.xml.
+ * <p>
+ * NOTE(review): the test fetches https://tika.apache.org, so it presumably
+ * needs outbound network access -- confirm before enabling in CI.
+ */
+public class FetcherTest extends CXFTestBase {
+
+    private static final String META_PATH = "/rmeta";
+    private static final String TEXT_PATH = "/text";
+
+    private static final String TEST_RECURSIVE_DOC =
+            "test-documents/test_recursive_embedded.docx";
+
+    @Override
+    protected void setUpResources(JAXRSServerFactoryBean sf) {
+        sf.setResourceClasses(RecursiveMetadataResource.class);
+        sf.setResourceProvider(RecursiveMetadataResource.class,
+                new SingletonResourceProvider(new RecursiveMetadataResource()));
+    }
+
+    @Override
+    protected void setUpProviders(JAXRSServerFactoryBean sf) {
+        List<Object> providers = new ArrayList<>();
+        providers.add(new MetadataListMessageBodyWriter());
+        sf.setProviders(providers);
+    }
+
+    @Override
+    protected InputStream getTikaConfigInputStream() {
+        return getClass().getResourceAsStream("/config/tika-config-url-fetcher.xml");
+    }
+
+    @Override
+    protected InputStreamFactory getInputStreamFactory(TikaConfig tikaConfig) {
+        return new FetcherStreamFactory(tikaConfig.getFetcher());
+    }
+
+    /**
+     * PUTs an empty body with a "fetcherString" header and checks that the
+     * gzipped JSON response deserializes to a non-empty metadata list.
+     */
+    @Test
+    public void testBasic() throws Exception {
+        Response response = WebClient
+                .create(endPoint + META_PATH)
+                .accept("application/json")
+                .acceptEncoding("gzip")
+                .header("fetcherString", "url:https://tika.apache.org")
+                .put("");
+        assertEquals(200, response.getStatus());
+
+        List<Metadata> metadataList;
+        // close the reader (and the gzip stream it wraps) once parsing is done
+        try (Reader reader = new InputStreamReader(
+                new GzipCompressorInputStream((InputStream) response.getEntity()), UTF_8)) {
+            metadataList = JsonMetadataList.fromJson(reader);
+        }
+        assertNotNull(metadataList);
+        assertTrue(metadataList.size() > 0);
+        TikaTest.debug(metadataList);
+    }
+
+}
diff --git a/tika-emitters/tika-emitter-solr/pom.xml
b/tika-server/tika-server-classic/src/test/resources/config/tika-config-url-fetcher.xml
similarity index 50%
copy from tika-emitters/tika-emitter-solr/pom.xml
copy to
tika-server/tika-server-classic/src/test/resources/config/tika-config-url-fetcher.xml
index 8ee76af..ea4079c 100644
--- a/tika-emitters/tika-emitter-solr/pom.xml
+++
b/tika-server/tika-server-classic/src/test/resources/config/tika-config-url-fetcher.xml
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="UTF-8"?>
+<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@@ -17,25 +17,11 @@
specific language governing permissions and limitations
under the License.
-->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <parent>
- <artifactId>tika-emitters</artifactId>
- <groupId>org.apache.tika</groupId>
- <version>2.0.0-SNAPSHOT</version>
- </parent>
- <modelVersion>4.0.0</modelVersion>
-
- <artifactId>tika-emitter-solr</artifactId>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- <scope>provided</scope>
- </dependency>
- </dependencies>
-
-</project>
\ No newline at end of file
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ </parsers>
+ <fetchers>
+ <fetcher class="org.apache.tika.fetcher.SimpleUrlFetcher"/>
+ </fetchers>
+</properties>
\ No newline at end of file
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/DefaultInputStreamFactory.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/DefaultInputStreamFactory.java
index c09b139..022d178 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/DefaultInputStreamFactory.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/DefaultInputStreamFactory.java
@@ -29,11 +29,6 @@ import java.io.InputStream;
public class DefaultInputStreamFactory implements InputStreamFactory {
@Override
- public InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders)
throws IOException {
- return is;
- }
-
- @Override
public InputStream getInputSteam(InputStream is, Metadata metadata,
HttpHeaders httpHeaders) throws IOException {
return is;
}
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/URLEnabledInputStreamFactory.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/FetcherStreamFactory.java
similarity index 70%
rename from
tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/URLEnabledInputStreamFactory.java
rename to
tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/FetcherStreamFactory.java
index bdb71c6..0841004 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/URLEnabledInputStreamFactory.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/FetcherStreamFactory.java
@@ -21,6 +21,8 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fetcher.Fetcher;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -40,30 +42,24 @@ import org.apache.tika.metadata.Metadata;
* See <a
href="https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2015-3271">CVE-2015-3271</a>
*
*/
-public class URLEnabledInputStreamFactory implements InputStreamFactory {
+public class FetcherStreamFactory implements InputStreamFactory {
- /**
- * @deprecated use {@link #getInputSteam(InputStream, Metadata,
HttpHeaders)}
- * @param is
- * @param httpHeaders
- * @return
- * @throws IOException
- */
- @Override
- @Deprecated
- public InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders)
throws IOException {
- String fileUrl = httpHeaders.getHeaderString("fileUrl");
- if(fileUrl != null && !"".equals(fileUrl)){
- return TikaInputStream.get(new URL(fileUrl));
- }
- return is;
- }
+ private final Fetcher fetcher;
+ public FetcherStreamFactory(Fetcher fetcher) {
+ this.fetcher = fetcher;
+ }
@Override
- public InputStream getInputSteam(InputStream is, Metadata metadata,
HttpHeaders httpHeaders) throws IOException {
- String fileUrl = httpHeaders.getHeaderString("fileUrl");
- if(fileUrl != null && !"".equals(fileUrl)){
- return TikaInputStream.get(new URL(fileUrl), metadata);
+ public InputStream getInputSteam(InputStream is, Metadata metadata,
+ HttpHeaders httpHeaders) throws
IOException {
+ String fetcherString = httpHeaders.getHeaderString("fetcherString");
+
+ if(fetcherString != null && !"".equals(fetcherString)){
+ try {
+ return fetcher.fetch(fetcherString, metadata);
+ } catch (TikaException e) {
+ throw new IOException(e);
+ }
}
return is;
}
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/InputStreamFactory.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/InputStreamFactory.java
index 4d293c6..66ec45a 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/InputStreamFactory.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/InputStreamFactory.java
@@ -31,7 +31,6 @@ import java.io.InputStream;
*/
public interface InputStreamFactory {
- InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders) throws
IOException;
InputStream getInputSteam(InputStream is, Metadata metadata, HttpHeaders
httpHeaders) throws IOException;
}
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
index d4773ba..b84b74d 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
@@ -45,10 +45,12 @@ import
org.apache.cxf.transport.common.gzip.GZIPOutInterceptor;
import org.apache.tika.Tika;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.emitter.Emitter;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.digestutils.BouncyCastleDigester;
import org.apache.tika.parser.digestutils.CommonsDigester;
import org.apache.tika.server.core.resource.DetectorResource;
+import org.apache.tika.server.core.resource.EmitterResource;
import org.apache.tika.server.core.resource.LanguageResource;
import org.apache.tika.server.core.resource.MetadataResource;
import org.apache.tika.server.core.resource.RecursiveMetadataResource;
@@ -85,11 +87,13 @@ public class TikaServerCli {
public static final Set<String> LOG_LEVELS = new
HashSet<>(Arrays.asList("debug", "info"));
private static final Logger LOG =
LoggerFactory.getLogger(TikaServerCli.class);
- private static final String FILE_URL_WARNING =
- "WARNING: You have chosen to run tika-server with fileUrl
enabled.\n"+
+ private static final String UNSECURE_WARNING =
+ "WARNING: You have chosen to run tika-server with unsecure
features enabled.\n"+
"Whoever has access to your service now has the same read
permissions\n"+
- "as tika-server. Users could request and receive a sensitive file
from your\n" +
- "drive or a webpage from your intranet. See CVE-2015-3271.\n"+
+ "as you've given your fetchers and the same write permissions as
your emitters.\n" +
+ "Users could request and receive a sensitive file from your\n" +
+ "drive or a webpage from your intranet and/or send malicious
content to\n" +
+ " your emitter endpoints. See CVE-2015-3271.\n"+
"Please make sure you know what you are doing.";
private static final List<String> ONLY_IN_FORK_MODE =
@@ -111,8 +115,8 @@ public class TikaServerCli {
options.addOption("i", "id", true, "id to use for server in server
status endpoint");
options.addOption("status", false, "enable the status endpoint");
options.addOption("?", "help", false, "this help message");
- options.addOption("enableUnsecureFeatures", false, "this is required
to enable fileUrl.");
- options.addOption("enableFileUrl", false, "allows user to pass in
fileUrl instead of InputStream.");
+ options.addOption("enableUnsecureFeatures", false, "this is required
to enable fetchers and emitters. "+
+ " The user acknowledges that fetchers and emitters introduce
potential security vulnerabilities.");
options.addOption("noFork", false, "legacy mode, less robust -- this
starts up tika-server" +
" without forking a process.");
options.addOption("taskTimeoutMillis", true,
@@ -272,21 +276,14 @@ public class TikaServerCli {
}
}
- if (line.hasOption("enableFileUrl") &&
- !line.hasOption("enableUnsecureFeatures")) {
- System.err.println("If you want to enable fileUrl, you must
also acknowledge the security risks\n"+
- "by including --enableUnsecureFeatures. See CVE-2015-3271.");
- System.exit(-1);
- }
InputStreamFactory inputStreamFactory = null;
- if (line.hasOption("enableFileUrl") &&
- line.hasOption("enableUnsecureFeatures")) {
- inputStreamFactory = new URLEnabledInputStreamFactory();
- System.out.println(FILE_URL_WARNING);
+ if (line.hasOption("enableUnsecureFeatures")) {
+ inputStreamFactory = new
FetcherStreamFactory(tika.getFetcher());
+ LOG.info(UNSECURE_WARNING);
} else {
inputStreamFactory = new DefaultInputStreamFactory();
}
-
+ logFetchersAndEmitters(line.hasOption("enableUnsecureFeatures"),
tika);
String serverId = line.hasOption("i") ? line.getOptionValue("i") :
UUID.randomUUID().toString();
LOG.debug("SERVER ID:" +serverId);
ServerStatus serverStatus;
@@ -329,6 +326,9 @@ public class TikaServerCli {
rCoreProviders.add(new SingletonResourceProvider(new
TikaDetectors()));
rCoreProviders.add(new SingletonResourceProvider(new
TikaParsers()));
rCoreProviders.add(new SingletonResourceProvider(new
TikaVersion()));
+ if (line.hasOption("enableUnsecureFeatures")) {
+ rCoreProviders.add(new SingletonResourceProvider(new
EmitterResource()));
+ }
rCoreProviders.addAll(loadResourceServices());
if (line.hasOption("status")) {
rCoreProviders.add(new SingletonResourceProvider(new
TikaServerStatus(serverStatus)));
@@ -374,6 +374,47 @@ public class TikaServerCli {
LOG.info("Started Apache Tika server at {}", url);
}
+    /**
+     * Logs which fetchers and emitters from the TikaConfig are exposed.
+     * <p>
+     * With unsecure features enabled, writes one INFO message listing the
+     * supported fetcher prefixes and the supported emitters (or saying that
+     * none are configured). With them disabled, writes a WARN message for
+     * each of emitters and fetchers that is configured but will not be
+     * available.
+     *
+     * @param enableUnsecureFeatures whether -enableUnsecureFeatures was given
+     *                               on the commandline
+     * @param tika                   config that supplies the fetcher and emitter
+     */
+    private static void logFetchersAndEmitters(boolean enableUnsecureFeatures,
+                                               TikaConfig tika) {
+        Set<String> supportedFetchers = tika.getFetcher().getSupportedPrefixes();
+        Set<String> emitters = tika.getEmitter().getSupported();
+        if (enableUnsecureFeatures) {
+            StringBuilder sb = new StringBuilder();
+            sb.append("enableUnsecureFeatures has been selected.\n");
+            if (supportedFetchers.isEmpty()) {
+                sb.append("There are no fetchers specified in the TikaConfig\n");
+            } else {
+                sb.append("The following fetchers are available to whomever " +
+                        "has access to this server:\n");
+                for (String p : supportedFetchers) {
+                    sb.append(p).append("\n");
+                }
+            }
+            // check the emitters themselves here, not the fetcher prefixes
+            if (emitters.isEmpty()) {
+                sb.append("There are no emitters specified in the TikaConfig\n");
+            } else {
+                sb.append("The following emitters are available to whomever " +
+                        "has access to this server:\n");
+                for (String e : emitters) {
+                    sb.append(e).append("\n");
+                }
+            }
+            LOG.info(sb.toString());
+        } else {
+            if (emitters.size() > 0) {
+                String warn = "-enableUnsecureFeatures has not been specified on the commandline.\n" +
+                        "The " + emitters.size() + " emitter(s) that you've\n" +
+                        "specified in TikaConfig will not be available on the /emit endpoint\n" +
+                        "To enable your emitters, start tika-server with the " +
+                        "-enableUnsecureFeatures flag\n\n";
+                LOG.warn(warn);
+            }
+            if (supportedFetchers.size() > 0) {
+                String warn = "-enableUnsecureFeatures has not been specified on the commandline.\n" +
+                        "The " + supportedFetchers.size() + " fetcher(s) that you've\n" +
+                        "specified in TikaConfig will not be available\n" +
+                        "To enable your fetchers, start tika-server with the " +
+                        "-enableUnsecureFeatures flag\n\n";
+                LOG.warn(warn);
+            }
+        }
+    }
+
private static Collection<? extends ResourceProvider>
loadResourceServices() {
List<TikaServerResource> resources = new
ServiceLoader(TikaServerCli.class.getClassLoader())
.loadServiceProviders(TikaServerResource.class);
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
new file mode 100644
index 0000000..3bb0b0e
--- /dev/null
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.core.resource;
+
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.tika.emitter.Emitter;
+import org.apache.tika.emitter.TikaEmitterException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.server.core.MetadataList;
+import org.apache.tika.utils.ExceptionUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.PUT;
+import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.MultivaluedMap;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.UriInfo;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.tika.server.core.resource.TikaResource.fillMetadata;
+import static
org.apache.tika.server.core.resource.TikaResource.fillParseContext;
+
+/**
+ * JAX-RS resource mounted at /emit: parses the incoming document and hands
+ * the resulting metadata list to a named emitter from the TikaConfig.
+ */
+@Path("/emit")
+public class EmitterResource {
+
+    private static final String EMITTER_PARAM = "emitter";
+    private static final Logger LOG = LoggerFactory.getLogger(EmitterResource.class);
+
+    /**
+     * Parses the request input via
+     * {@link RecursiveMetadataResource#parseMetadata} with the "text" handler
+     * type, then passes the resulting list of {@link Metadata} objects to the
+     * emitter named in the path, e.g. /emit/solr.
+     * <p>
+     * An {@link IOException} or {@link TikaEmitterException} thrown while
+     * emitting is logged and reported in the returned map rather than
+     * rethrown; exceptions from parsing propagate to the caller.
+     *
+     * @param is          request body
+     * @param httpHeaders request headers
+     * @param info        uri info
+     * @param emitterName which emitter to use; emitters must be configured in
+     *                    the TikaConfig file
+     * @return map with "status" ("ok" or "emitter_exception"), "emitter"
+     *         (the requested emitter name) and, on failure, "exception_msg"
+     *         holding the stack trace
+     * @throws Exception if parsing fails
+     */
+    @PUT
+    @Produces("application/json")
+    @Path("{" + EMITTER_PARAM + " : (\\w+)?}")
+    public Map<String, String> getMetadata(InputStream is,
+                                           @Context HttpHeaders httpHeaders,
+                                           @Context UriInfo info,
+                                           @PathParam(EMITTER_PARAM) String emitterName
+    ) throws Exception {
+
+        Metadata metadata = new Metadata();
+        InputStream parseStream = TikaResource.getInputStream(is, metadata, httpHeaders);
+        List<Metadata> metadataList = RecursiveMetadataResource.parseMetadata(
+                parseStream, metadata, httpHeaders.getRequestHeaders(), info, "text");
+
+        Emitter emitter = TikaResource.getConfig().getEmitter();
+
+        Map<String, String> statusMap = new HashMap<>();
+        statusMap.put("emitter", emitterName);
+        try {
+            emitter.emit(emitterName, metadataList);
+            statusMap.put("status", "ok");
+        } catch (IOException | TikaEmitterException e) {
+            LOG.warn("problem with emitting", e);
+            statusMap.put("status", "emitter_exception");
+            String stackTrace = ExceptionUtils.getStackTrace(e);
+            if (stackTrace.length() > 0) {
+                statusMap.put("exception_msg", stackTrace);
+            }
+        }
+        return statusMap;
+    }
+
+}
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index e0bc1ca..298d848 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -29,6 +29,7 @@ import javax.ws.rs.core.MultivaluedMap;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.UriInfo;
import java.io.InputStream;
+import java.util.List;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.tika.metadata.Metadata;
@@ -84,7 +85,7 @@ public class RecursiveMetadataResource {
@PathParam(HANDLER_TYPE_PARAM)
String handlerTypeName)
throws Exception {
return Response.ok(
- parseMetadata(att.getObject(InputStream.class), new Metadata(),
+ parseMetadataToMetadataList(att.getObject(InputStream.class),
new Metadata(),
att.getHeaders(), info, handlerTypeName)).build();
}
@@ -122,14 +123,17 @@ public class RecursiveMetadataResource {
) throws Exception {
Metadata metadata = new Metadata();
return Response.ok(
- parseMetadata(TikaResource.getInputStream(is, metadata,
httpHeaders),
+ parseMetadataToMetadataList(TikaResource.getInputStream(is,
metadata, httpHeaders),
metadata,
httpHeaders.getRequestHeaders(), info, handlerTypeName)).build();
}
- private MetadataList parseMetadata(InputStream is, Metadata metadata,
- MultivaluedMap<String, String> httpHeaders, UriInfo
info, String handlerTypeName)
- throws Exception {
+ public static List<Metadata> parseMetadata(InputStream is,
+ Metadata metadata,
+ MultivaluedMap<String, String>
httpHeaders,
+ UriInfo info, String
handlerTypeName)
+ throws Exception {
+
final ParseContext context = new ParseContext();
Parser parser = TikaResource.createParser();
// TODO: parameterize choice of max chars/max embedded
attachments
@@ -147,7 +151,7 @@ public class RecursiveMetadataResource {
int maxEmbeddedResources = -1;
if (httpHeaders.containsKey("maxEmbeddedResources")) {
- maxEmbeddedResources =
Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
+ maxEmbeddedResources =
Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
}
BasicContentHandlerFactory.HANDLER_TYPE type =
@@ -170,7 +174,14 @@ public class RecursiveMetadataResource {
}
},
*/
- return new MetadataList(handler.getMetadataList());
+ return handler.getMetadataList();
}
+
+    /**
+     * Runs {@link #parseMetadata} with the same arguments and wraps the
+     * resulting list in a {@link MetadataList}.
+     *
+     * @throws Exception whatever {@link #parseMetadata} throws
+     */
+    private MetadataList parseMetadataToMetadataList(InputStream is, Metadata metadata,
+                                                     MultivaluedMap<String, String> httpHeaders,
+                                                     UriInfo info, String handlerTypeName)
+            throws Exception {
+        List<Metadata> parsed = parseMetadata(is, metadata, httpHeaders, info, handlerTypeName);
+        return new MetadataList(parsed);
+    }
}
diff --git
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index 5a9afbe..47ada93 100644
---
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@ -93,7 +93,7 @@ public abstract class CXFTestBase {
this.tika = new TikaConfig(getTikaConfigInputStream());
TikaResource.init(tika,
new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
- new DefaultInputStreamFactory(), new ServerStatus("", 0,true));
+ getInputStreamFactory(tika), new ServerStatus("", 0,true));
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
//set compression interceptors
sf.setOutInterceptors(
@@ -120,6 +120,10 @@ public abstract class CXFTestBase {
server = sf.create();
}
+ /**
+ * Supplies the {@link InputStreamFactory} handed to TikaResource.init
+ * during setup. Returns a {@link DefaultInputStreamFactory}; subclasses
+ * may override to plug in a different factory.
+ *
+ * @param tikaConfig the config loaded for this test; unused here
+ */
+ protected InputStreamFactory getInputStreamFactory(TikaConfig tikaConfig) {
+ return new DefaultInputStreamFactory();
+ }
+
protected InputStream getTikaConfigInputStream() {
return new ByteArrayInputStream(
new String("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"+