Repository: tika
Updated Branches:
refs/heads/2.x 673533d0e -> ce1fc3720
* Re-enable fileUrl for tika-server (TIKA-2081). If you choose
to use this feature, beware of the security vulnerabilities!
See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2015-3271
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ce1fc372
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ce1fc372
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ce1fc372
Branch: refs/heads/2.x
Commit: ce1fc3720cdcd84b5523b14fc28fd2fb355c9ee2
Parents: 673533d
Author: tballison <[email protected]>
Authored: Fri Sep 23 09:59:41 2016 -0400
Committer: tballison <[email protected]>
Committed: Fri Sep 23 09:59:41 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 4 ++
.../tika/server/DefaultInputStreamFactory.java | 33 +++++++++++++
.../apache/tika/server/InputStreamFactory.java | 34 +++++++++++++
.../org/apache/tika/server/TikaServerCli.java | 25 +++++++++-
.../server/URLEnabledInputStreamFactory.java | 52 ++++++++++++++++++++
.../tika/server/resource/DetectorResource.java | 2 +-
.../tika/server/resource/MetadataResource.java | 9 ++--
.../resource/RecursiveMetadataResource.java | 6 +--
.../tika/server/resource/TikaResource.java | 20 ++++++--
.../tika/server/resource/UnpackerResource.java | 8 +--
.../org/apache/tika/server/CXFTestBase.java | 4 +-
11 files changed, 178 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/ce1fc372/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index d13a644..362f7e9 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,10 @@ Release 2.0 - ???
Release 1.14 - ???
+ * Re-enable fileUrl for tika-server (TIKA-2081). If you choose
+ to use this feature, beware of the security vulnerabilities!
+ See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2015-3271
+
* Add Tesseract's hOCR output format as an option, via Eric Pugh
(TIKA-2093).
http://git-wip-us.apache.org/repos/asf/tika/blob/ce1fc372/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java
----------------------------------------------------------------------
diff --git
a/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java
b/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java
new file mode 100644
index 0000000..a2df856
--- /dev/null
+++
b/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import javax.ws.rs.core.HttpHeaders;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Passthrough -- returns InputStream as is
+ */
+public class DefaultInputStreamFactory implements InputStreamFactory {
+
+ @Override
+ public InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders)
throws IOException {
+ return is;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/ce1fc372/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java
----------------------------------------------------------------------
diff --git
a/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java
b/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java
new file mode 100644
index 0000000..27e7f86
--- /dev/null
+++ b/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import javax.ws.rs.core.HttpHeaders;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Interface to allow for custom/consistent creation of InputStream
+ * <p>
+ * This factory is used statically in TikaResource. Make sure not
+ * to hold instance state in implementations.
+ */
+public interface InputStreamFactory {
+
+ public InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders)
throws IOException;
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/ce1fc372/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
----------------------------------------------------------------------
diff --git
a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
index aafde60..6e8fce3 100644
--- a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
+++ b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
@@ -68,6 +68,13 @@ public class TikaServerCli {
new HashSet<String>(Arrays.asList("debug", "info"));
private static final Log logger = LogFactory.getLog(TikaServerCli.class);
+ private static final String FILE_URL_WARNING =
+ "WARNING: You have chosen to run tika-server with fileUrl
enabled.\n"+
+ "Whoever has access to your service now has the same read
permissions\n"+
+ "as tika-server. Users could request and receive a sensitive file
from your\n" +
+ "drive or a webpage from your intranet. See CVE-2015-3271.\n"+
+ "Please make sure you know what you are doing.";
+
private static Options getOptions() {
Options options = new Options();
options.addOption("C", "cors", true, "origin allowed to make CORS
requests (default=NONE)\nall allowed if \"all\"");
@@ -79,6 +86,8 @@ public class TikaServerCli {
options.addOption("l", "log", true, "request URI log level ('debug' or
'info')");
options.addOption("s", "includeStack", false, "whether or not to
return a stack trace\nif there is an exception during 'parse'");
options.addOption("?", "help", false, "this help message");
+ options.addOption("enable-unsecure-features", false, "this is required
to enable fileUrl.");
+ options.addOption("enable-fileUrl", false, "allows user to pass in
fileUrl instead of InputStream.");
return options;
}
@@ -166,8 +175,22 @@ public class TikaServerCli {
CommonsDigester.parse(line.getOptionValue("digest")));
}
+ if (line.hasOption("enable-fileUrl") &&
+ !line.hasOption("enable-unsecure-features")) {
+ System.err.println("If you want to enable fileUrl, you must
also acknowledge the security risks\n"+
+ "by including --enable-unsecure-features. See
CVE-2015-3271.");
+ System.exit(-1);
+ }
+ InputStreamFactory inputStreamFactory = null;
+ if (line.hasOption("enable-fileUrl") &&
+ line.hasOption("enable-unsecure-features")) {
+ inputStreamFactory = new URLEnabledInputStreamFactory();
+ System.out.println(FILE_URL_WARNING);
+ } else {
+ inputStreamFactory = new DefaultInputStreamFactory();
+ }
- TikaResource.init(tika, digester);
+ TikaResource.init(tika, digester, inputStreamFactory);
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
List<ResourceProvider> rCoreProviders = new
ArrayList<ResourceProvider>();
http://git-wip-us.apache.org/repos/asf/tika/blob/ce1fc372/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java
----------------------------------------------------------------------
diff --git
a/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java
b/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java
new file mode 100644
index 0000000..10d4180
--- /dev/null
+++
b/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import javax.ws.rs.core.HttpHeaders;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+import org.apache.tika.io.TikaInputStream;
+
+/**
+ * This class looks for "fileUrl" in the http header. If it is not null
+ * and not empty, this will return a new TikaInputStream from the URL.
+ * <p>
+ * This is not meant to be used in place of a robust, responsible crawler. Rather, this
+ * is a convenience factory.
+ * <p>
+ * <em>WARNING:</em> Unless you carefully lock down access to the server,
+ * whoever has access to this service will have the read access of the server.
+ * In short, anyone with access to this service could request and get
+ * "file:///etc/supersensitive_file_dont_read.txt". Or, if your server has access
+ * to your intranet, and you let the public hit this service, they will now
+ * have access to your intranet.
+ * See <a href="https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2015-3271">CVE-2015-3271</a>
+ *
+ */
+public class URLEnabledInputStreamFactory implements InputStreamFactory {
+
+ @Override
+ public InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders)
throws IOException {
+ String fileUrl = httpHeaders.getHeaderString("fileUrl");
+ if(fileUrl != null && !"".equals(fileUrl)){
+ return TikaInputStream.get(new URL(fileUrl));
+ }
+ return is;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/ce1fc372/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
----------------------------------------------------------------------
diff --git
a/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
b/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
index f1f5a29..9f19ad6 100644
---
a/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
+++
b/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
@@ -46,7 +46,7 @@ public class DetectorResource {
public String detect(final InputStream is,
@Context HttpHeaders httpHeaders, @Context final
UriInfo info) {
Metadata met = new Metadata();
- TikaInputStream tis = TikaInputStream.get(is);
+ TikaInputStream tis =
TikaInputStream.get(TikaResource.getInputStream(is, httpHeaders));
String filename = TikaResource.detectFilename(httpHeaders
.getRequestHeaders());
logger.info("Detecting media type for Filename: " + filename);
http://git-wip-us.apache.org/repos/asf/tika/blob/ce1fc372/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
----------------------------------------------------------------------
diff --git
a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
index 89d35e8..e5e5a1f 100644
---
a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
+++
b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
@@ -17,9 +17,6 @@
package org.apache.tika.server.resource;
-import java.io.IOException;
-import java.io.InputStream;
-
import javax.ws.rs.Consumes;
import javax.ws.rs.POST;
import javax.ws.rs.PUT;
@@ -31,6 +28,8 @@ import javax.ws.rs.core.HttpHeaders;
import javax.ws.rs.core.MultivaluedMap;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.UriInfo;
+import java.io.IOException;
+import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -58,7 +57,7 @@ public class MetadataResource {
@Produces({"text/csv", "application/json", "application/rdf+xml"})
public Response getMetadata(InputStream is, @Context HttpHeaders
httpHeaders, @Context UriInfo info) throws Exception {
return Response.ok(
- parseMetadata(is, httpHeaders.getRequestHeaders(),
info)).build();
+ parseMetadata(TikaResource.getInputStream(is, httpHeaders),
httpHeaders.getRequestHeaders(), info)).build();
}
/**
@@ -94,7 +93,7 @@ public class MetadataResource {
Response.Status defaultErrorResponse = Response.Status.BAD_REQUEST;
Metadata metadata = null;
try {
- metadata = parseMetadata(is, httpHeaders.getRequestHeaders(),
info);
+ metadata = parseMetadata(TikaResource.getInputStream(is,
httpHeaders), httpHeaders.getRequestHeaders(), info);
// once we've parsed the document successfully, we should use
NOT_FOUND
// if we did not see the field
defaultErrorResponse = Response.Status.NOT_FOUND;
http://git-wip-us.apache.org/repos/asf/tika/blob/ce1fc372/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
----------------------------------------------------------------------
diff --git
a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
index aa4e0ab..b967f8b 100644
---
a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
+++
b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
@@ -17,8 +17,6 @@
package org.apache.tika.server.resource;
-import java.io.InputStream;
-
import javax.ws.rs.Consumes;
import javax.ws.rs.POST;
import javax.ws.rs.PUT;
@@ -30,6 +28,7 @@ import javax.ws.rs.core.HttpHeaders;
import javax.ws.rs.core.MultivaluedMap;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.UriInfo;
+import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -118,7 +117,8 @@ public class RecursiveMetadataResource {
@PathParam(HANDLER_TYPE_PARAM) String
handlerTypeName
) throws Exception {
return Response.ok(
- parseMetadata(is, httpHeaders.getRequestHeaders(), info,
handlerTypeName)).build();
+ parseMetadata(TikaResource.getInputStream(is, httpHeaders),
+
httpHeaders.getRequestHeaders(), info, handlerTypeName)).build();
}
private MetadataList parseMetadata(InputStream is,
http://git-wip-us.apache.org/repos/asf/tika/blob/ce1fc372/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
----------------------------------------------------------------------
diff --git
a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index d74ef74..bbff5bf 100644
---
a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++
b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -73,6 +73,7 @@ import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.server.RichTextContentHandler;
+import org.apache.tika.server.InputStreamFactory;
import org.apache.tika.server.TikaServerParseException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -88,10 +89,13 @@ public class TikaResource {
private static TikaConfig tikaConfig;
private static DigestingParser.Digester digester = null;
+ private static InputStreamFactory inputStreamFactory = null;
- public static void init(TikaConfig config, DigestingParser.Digester
digestr) {
+ public static void init(TikaConfig config, DigestingParser.Digester
digestr,
+ InputStreamFactory iSF) {
tikaConfig = config;
digester = digestr;
+ inputStreamFactory = iSF;
}
static {
@@ -172,6 +176,14 @@ public class TikaResource {
}
}
+ public static InputStream getInputStream(InputStream is, HttpHeaders
headers) {
+ try {
+ return inputStreamFactory.getInputSteam(is, headers);
+ } catch (IOException e) {
+ throw new TikaServerParseException(e);
+ }
+ }
+
/**
* Utility method to set a property on a class via reflection.
*
@@ -337,7 +349,7 @@ public class TikaResource {
@Consumes("*/*")
@Produces("text/plain")
public StreamingOutput getText(final InputStream is, @Context HttpHeaders
httpHeaders, @Context final UriInfo info) {
- return produceText(is, httpHeaders.getRequestHeaders(), info);
+ return produceText(getInputStream(is, httpHeaders),
httpHeaders.getRequestHeaders(), info);
}
public StreamingOutput produceText(final InputStream is,
MultivaluedMap<String, String> httpHeaders, final UriInfo info) {
@@ -375,7 +387,7 @@ public class TikaResource {
@Consumes("*/*")
@Produces("text/html")
public StreamingOutput getHTML(final InputStream is, @Context HttpHeaders
httpHeaders, @Context final UriInfo info) {
- return produceOutput(is, httpHeaders.getRequestHeaders(), info,
"html");
+ return produceOutput(getInputStream(is, httpHeaders),
httpHeaders.getRequestHeaders(), info, "html");
}
@POST
@@ -390,7 +402,7 @@ public class TikaResource {
@Consumes("*/*")
@Produces("text/xml")
public StreamingOutput getXML(final InputStream is, @Context HttpHeaders
httpHeaders, @Context final UriInfo info) {
- return produceOutput(is, httpHeaders.getRequestHeaders(), info, "xml");
+ return produceOutput(getInputStream(is, httpHeaders),
httpHeaders.getRequestHeaders(), info, "xml");
}
private StreamingOutput produceOutput(final InputStream is, final
MultivaluedMap<String, String> httpHeaders,
http://git-wip-us.apache.org/repos/asf/tika/blob/ce1fc372/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
----------------------------------------------------------------------
diff --git
a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
index cf3a0e9..57148ec 100644
---
a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
+++
b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
@@ -17,6 +17,8 @@
package org.apache.tika.server.resource;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
@@ -63,8 +65,6 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
@Path("/unpack")
public class UnpackerResource {
public static final String TEXT_FILENAME = "__TEXT__";
@@ -93,7 +93,7 @@ public class UnpackerResource {
@Context HttpHeaders httpHeaders,
@Context UriInfo info
) throws Exception {
- return process(is, httpHeaders, info, false);
+ return process(TikaResource.getInputStream(is, httpHeaders),
httpHeaders, info, false);
}
@Path("/all{id:(/.*)?}")
@@ -104,7 +104,7 @@ public class UnpackerResource {
@Context HttpHeaders httpHeaders,
@Context UriInfo info
) throws Exception {
- return process(is, httpHeaders, info, true);
+ return process(TikaResource.getInputStream(is, httpHeaders),
httpHeaders, info, true);
}
private Map<String, byte[]> process(
http://git-wip-us.apache.org/repos/asf/tika/blob/ce1fc372/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index c9c9195..5235ff8 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -74,7 +74,9 @@ public abstract class CXFTestBase extends TikaTest {
@Before
public void setUp() {
this.tika = TikaConfig.getDefaultConfig();
- TikaResource.init(tika, new CommonsDigester(DIGESTER_READ_LIMIT,
CommonsDigester.DigestAlgorithm.MD5));
+ TikaResource.init(tika,
+ new CommonsDigester(DIGESTER_READ_LIMIT,
CommonsDigester.DigestAlgorithm.MD5),
+ new DefaultInputStreamFactory());
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
setUpResources(sf);
setUpProviders(sf);