Repository: tika Updated Branches: refs/heads/master 3a5431e20 -> d612aea85
TIKA-2081 -- add fileUrl back into tika-server Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d612aea8 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d612aea8 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d612aea8 Branch: refs/heads/master Commit: d612aea850060c7d77124f79c525f68032a11031 Parents: 3a5431e Author: tballison <[email protected]> Authored: Fri Sep 23 09:49:28 2016 -0400 Committer: tballison <[email protected]> Committed: Fri Sep 23 09:49:28 2016 -0400 ---------------------------------------------------------------------- .../tika/server/DefaultInputStreamFactory.java | 33 +++++++++++++ .../apache/tika/server/InputStreamFactory.java | 34 +++++++++++++ .../org/apache/tika/server/TikaServerCli.java | 25 +++++++++- .../server/URLEnabledInputStreamFactory.java | 52 ++++++++++++++++++++ .../tika/server/resource/DetectorResource.java | 2 +- .../tika/server/resource/MetadataResource.java | 9 ++-- .../resource/RecursiveMetadataResource.java | 6 +-- .../tika/server/resource/TikaResource.java | 20 ++++++-- .../tika/server/resource/UnpackerResource.java | 8 +-- .../org/apache/tika/server/CXFTestBase.java | 4 +- 10 files changed, 174 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java b/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java new file mode 100644 index 0000000..a2df856 --- /dev/null +++ b/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.server; + +import javax.ws.rs.core.HttpHeaders; +import java.io.IOException; +import java.io.InputStream; + +/** + * Passthrough -- returns InputStream as is + */ +public class DefaultInputStreamFactory implements InputStreamFactory { + + @Override + public InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders) throws IOException { + return is; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java b/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java new file mode 100644 index 0000000..27e7f86 --- /dev/null +++ b/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.server; + +import javax.ws.rs.core.HttpHeaders; +import java.io.IOException; +import java.io.InputStream; + +/** + * Interface to allow for custom/consistent creation of InputStream + * <p> + * This factory is used statically in TikaResource. Make sure not + * to hold instance state in implementations. + */ +public interface InputStreamFactory { + + public InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders) throws IOException; + +} http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java index 4804398..831a6d3 100644 --- a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java +++ b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java @@ -68,6 +68,13 @@ public class TikaServerCli { new HashSet<String>(Arrays.asList("debug", "info")); private static final Log logger = LogFactory.getLog(TikaServerCli.class); + private static final String FILE_URL_WARNING = + "WARNING: You have chosen to run tika-server with fileUrl enabled.\n"+ + "Whoever has access to your service now has the same read permissions\n"+ + "as tika-server. Users could request and receive a sensitive file from your\n" + + "drive or a webpage from your intranet. See CVE-2015-3271.\n"+ + "Please make sure you know what you are doing."; + private static Options getOptions() { Options options = new Options(); options.addOption("C", "cors", true, "origin allowed to make CORS requests (default=NONE)\nall allowed if \"all\""); @@ -79,6 +86,8 @@ public class TikaServerCli { options.addOption("l", "log", true, "request URI log level ('debug' or 'info')"); options.addOption("s", "includeStack", false, "whether or not to return a stack trace\nif there is an exception during 'parse'"); options.addOption("?", "help", false, "this help message"); + options.addOption("enable-unsecure-features", false, "this is required to enable fileUrl."); + options.addOption("enable-fileUrl", false, "allows user to pass in fileUrl instead of InputStream."); return options; } @@ -166,8 +175,22 @@ public class TikaServerCli { CommonsDigester.parse(line.getOptionValue("digest"))); } + if (line.hasOption("enable-fileUrl") && + !line.hasOption("enable-unsecure-features")) { + System.err.println("If you want to enable fileUrl, you must also acknowledge the security risks\n"+ + "by including --enable-unsecure-features. See CVE-2015-3271."); + System.exit(-1); + } + InputStreamFactory inputStreamFactory = null; + if (line.hasOption("enable-fileUrl") && + line.hasOption("enable-unsecure-features")) { + inputStreamFactory = new URLEnabledInputStreamFactory(); + System.out.println(FILE_URL_WARNING); + } else { + inputStreamFactory = new DefaultInputStreamFactory(); + } - TikaResource.init(tika, digester); + TikaResource.init(tika, digester, inputStreamFactory); JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean(); List<ResourceProvider> rCoreProviders = new ArrayList<ResourceProvider>(); http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java b/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java new file mode 100644 index 0000000..10d4180 --- /dev/null +++ b/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.server; + +import javax.ws.rs.core.HttpHeaders; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; + +import org.apache.tika.io.TikaInputStream; + +/** + * This class looks for "fileUrl" in the http header. If it is not null + * and not empty, this will return a new TikaInputStream from the URL. + * <p> + * This is not meant to be used in place of a robust, responsible crawler. Rather, this + * is a convenience factory. + * <p> + * <em>WARNING:</em> Unless you carefully lock down access to the server, + * whoever has access to this service will have the read access of the server. + * In short, anyone with access to this service could request and get + * "file:///etc/supersensitive_file_dont_read.txt". Or, if your server has access + * to your intranet, and you let the public hit this service, they will now + * have access to your intranet. + * See <a href="https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2015-3271">CVE-2015-3271</a> + * + */ +public class URLEnabledInputStreamFactory implements InputStreamFactory { + + @Override + public InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders) throws IOException { + String fileUrl = httpHeaders.getHeaderString("fileUrl"); + if(fileUrl != null && !"".equals(fileUrl)){ + return TikaInputStream.get(new URL(fileUrl)); + } + return is; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java index f1f5a29..9f19ad6 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java @@ -46,7 +46,7 @@ public class DetectorResource { public String detect(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) { Metadata met = new Metadata(); - TikaInputStream tis = TikaInputStream.get(is); + TikaInputStream tis = TikaInputStream.get(TikaResource.getInputStream(is, httpHeaders)); String filename = TikaResource.detectFilename(httpHeaders .getRequestHeaders()); logger.info("Detecting media type for Filename: " + filename); http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java index 89d35e8..e5e5a1f 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java @@ -17,9 +17,6 @@ package org.apache.tika.server.resource; -import java.io.IOException; -import java.io.InputStream; - import javax.ws.rs.Consumes; import javax.ws.rs.POST; import javax.ws.rs.PUT; @@ -31,6 +28,8 @@ import javax.ws.rs.core.HttpHeaders; import javax.ws.rs.core.MultivaluedMap; import javax.ws.rs.core.Response; import javax.ws.rs.core.UriInfo; +import java.io.IOException; +import java.io.InputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -58,7 +57,7 @@ public class MetadataResource { @Produces({"text/csv", "application/json", "application/rdf+xml"}) public Response getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info) throws Exception { return Response.ok( - parseMetadata(is, httpHeaders.getRequestHeaders(), info)).build(); + parseMetadata(TikaResource.getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info)).build(); } /** @@ -94,7 +93,7 @@ public class MetadataResource { Response.Status defaultErrorResponse = Response.Status.BAD_REQUEST; Metadata metadata = null; try { - metadata = parseMetadata(is, httpHeaders.getRequestHeaders(), info); + metadata = parseMetadata(TikaResource.getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info); // once we've parsed the document successfully, we should use NOT_FOUND // if we did not see the field defaultErrorResponse = Response.Status.NOT_FOUND; http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java index aa4e0ab..b967f8b 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java @@ -17,8 +17,6 @@ package org.apache.tika.server.resource; -import java.io.InputStream; - import javax.ws.rs.Consumes; import javax.ws.rs.POST; import javax.ws.rs.PUT; @@ -30,6 +28,7 @@ import javax.ws.rs.core.HttpHeaders; import javax.ws.rs.core.MultivaluedMap; import javax.ws.rs.core.Response; import javax.ws.rs.core.UriInfo; +import java.io.InputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -118,7 +117,8 @@ public class RecursiveMetadataResource { @PathParam(HANDLER_TYPE_PARAM) String handlerTypeName ) throws Exception { return Response.ok( - parseMetadata(is, httpHeaders.getRequestHeaders(), info, handlerTypeName)).build(); + parseMetadata(TikaResource.getInputStream(is, httpHeaders), + httpHeaders.getRequestHeaders(), info, handlerTypeName)).build(); } private MetadataList parseMetadata(InputStream is, http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java index 566203a..c5150a1 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java @@ -73,6 +73,7 @@ import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ExpandedTitleContentHandler; import org.apache.tika.sax.RichTextContentHandler; +import org.apache.tika.server.InputStreamFactory; import org.apache.tika.server.TikaServerParseException; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -88,10 +89,13 @@ public class TikaResource { private static TikaConfig tikaConfig; private static DigestingParser.Digester digester = null; + private static InputStreamFactory inputStreamFactory = null; - public static void init(TikaConfig config, DigestingParser.Digester digestr) { + public static void init(TikaConfig config, DigestingParser.Digester digestr, + InputStreamFactory iSF) { tikaConfig = config; digester = digestr; + inputStreamFactory = iSF; } static { @@ -172,6 +176,14 @@ public class TikaResource { } } + public static InputStream getInputStream(InputStream is, HttpHeaders headers) { + try { + return inputStreamFactory.getInputSteam(is, headers); + } catch (IOException e) { + throw new TikaServerParseException(e); + } + } + /** * Utility method to set a property on a class via reflection. * @@ -337,7 +349,7 @@ public class TikaResource { @Consumes("*/*") @Produces("text/plain") public StreamingOutput getText(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) { - return produceText(is, httpHeaders.getRequestHeaders(), info); + return produceText(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info); } public StreamingOutput produceText(final InputStream is, MultivaluedMap<String, String> httpHeaders, final UriInfo info) { @@ -375,7 +387,7 @@ public class TikaResource { @Consumes("*/*") @Produces("text/html") public StreamingOutput getHTML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) { - return produceOutput(is, httpHeaders.getRequestHeaders(), info, "html"); + return produceOutput(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info, "html"); } @POST @@ -390,7 +402,7 @@ public class TikaResource { @Consumes("*/*") @Produces("text/xml") public StreamingOutput getXML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) { - return produceOutput(is, httpHeaders.getRequestHeaders(), info, "xml"); + return produceOutput(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info, "xml"); } private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders, http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java index 8ee516e..383af98 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java @@ -17,6 +17,8 @@ package org.apache.tika.server.resource; +import static java.nio.charset.StandardCharsets.UTF_8; + import javax.ws.rs.PUT; import javax.ws.rs.Path; import javax.ws.rs.Produces; @@ -63,8 +65,6 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import static java.nio.charset.StandardCharsets.UTF_8; - @Path("/unpack") public class UnpackerResource { public static final String TEXT_FILENAME = "__TEXT__"; @@ -93,7 +93,7 @@ public class UnpackerResource { @Context HttpHeaders httpHeaders, @Context UriInfo info ) throws Exception { - return process(is, httpHeaders, info, false); + return process(TikaResource.getInputStream(is, httpHeaders), httpHeaders, info, false); } @Path("/all{id:(/.*)?}") @@ -104,7 +104,7 @@ public class UnpackerResource { @Context HttpHeaders httpHeaders, @Context UriInfo info ) throws Exception { - return process(is, httpHeaders, info, true); + return process(TikaResource.getInputStream(is, httpHeaders), httpHeaders, info, true); } private Map<String, byte[]> process( http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java ---------------------------------------------------------------------- diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java index 770b678..2a09968 100644 --- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java +++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java @@ -82,7 +82,9 @@ public abstract class CXFTestBase { @Before public void setUp() { this.tika = TikaConfig.getDefaultConfig(); - TikaResource.init(tika, new CommonsDigester(DIGESTER_READ_LIMIT, CommonsDigester.DigestAlgorithm.MD5)); + TikaResource.init(tika, + new CommonsDigester(DIGESTER_READ_LIMIT, CommonsDigester.DigestAlgorithm.MD5), + new DefaultInputStreamFactory()); JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean(); setUpResources(sf); setUpProviders(sf);
