Author: mattmann
Date: Sat May 9 18:26:42 2015
New Revision: 1678515
URL: http://svn.apache.org/r1678515
Log:
Fix for TIKA-1625: Add support to Tika Server for parsing remote file URLs and
for providing language detection. Contributed by junwei1229
<[email protected]>. This closes #48.
Added:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaUtils.java
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1678515&r1=1678514&r2=1678515&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat May 9 18:26:42 2015
@@ -1,5 +1,10 @@
Release 1.9 - Current Development
+ * Tika Server now allows for metadata extraction from remote
+ URLs and, in addition, outputs the detected language as a
+ metadata field (TIKA-1625).
+
+
* OUTPUT_FILE_TOKEN not being replaced in ExternalParser
contributed by Pascal Essiembre (TIKA-1620).
Modified:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java?rev=1678515&r1=1678514&r2=1678515&view=diff
==============================================================================
---
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
(original)
+++
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
Sat May 9 18:26:42 2015
@@ -36,6 +36,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.language.ProfilingHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
@@ -65,7 +66,7 @@ public class MetadataResource {
@Produces({"text/csv", "application/json", "application/rdf+xml"})
public Response getMetadata(InputStream is, @Context HttpHeaders
httpHeaders, @Context UriInfo info) throws Exception {
return Response.ok(
- parseMetadata(is, httpHeaders.getRequestHeaders(),
info)).build();
+ parseMetadata(TikaUtils.getInputSteam(is, httpHeaders),
httpHeaders.getRequestHeaders(), info)).build();
}
/**
@@ -101,7 +102,7 @@ public class MetadataResource {
Response.Status defaultErrorResponse = Response.Status.BAD_REQUEST;
Metadata metadata = null;
try {
- metadata = parseMetadata(is, httpHeaders.getRequestHeaders(),
info);
+ metadata = parseMetadata(TikaUtils.getInputSteam(is, httpHeaders),
httpHeaders.getRequestHeaders(), info);
// once we've parsed the document successfully, we should use
NOT_FOUND
// if we did not see the field
defaultErrorResponse = Response.Status.NOT_FOUND;
@@ -131,7 +132,12 @@ public class MetadataResource {
//no need to pass parser for embedded document parsing
TikaResource.fillParseContext(context, httpHeaders, null);
TikaResource.logRequest(logger, info, metadata);
- TikaResource.parse(parser, logger, info.getPath(), is, new
DefaultHandler(), metadata, context);
+ TikaResource.parse(parser, logger, info.getPath(), is,
+ new ProfilingHandler() {
+ public void endDocument() {
+ metadata.set("language", getLanguage().getLanguage());
+ }},
+ metadata, context);
return metadata;
}
}
Modified:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java?rev=1678515&r1=1678514&r2=1678515&view=diff
==============================================================================
---
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
(original)
+++
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
Sat May 9 18:26:42 2015
@@ -295,8 +295,8 @@ public class TikaResource {
@PUT
@Consumes("*/*")
@Produces("text/plain")
- public StreamingOutput getText(final InputStream is, @Context HttpHeaders
httpHeaders, @Context final UriInfo info) {
- return produceText(is, httpHeaders.getRequestHeaders(), info);
+ public StreamingOutput getText(final InputStream is, @Context HttpHeaders
httpHeaders, @Context final UriInfo info) throws IOException {
+ return produceText(TikaUtils.getInputSteam(is, httpHeaders),
httpHeaders.getRequestHeaders(), info);
}
public StreamingOutput produceText(final InputStream is,
MultivaluedMap<String, String> httpHeaders, final UriInfo info) {
Added:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaUtils.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaUtils.java?rev=1678515&view=auto
==============================================================================
---
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaUtils.java
(added)
+++
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaUtils.java
Sat May 9 18:26:42 2015
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.resource;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+public class TikaUtils {
+
+    /**
+     * Chooses the stream to parse for the current request.
+     *
+     * <p>If the request body reports no available bytes and the request carries
+     * a non-empty {@code fileUrl} header, the document is opened from that URL.
+     * In every other case the request body is returned unchanged.</p>
+     *
+     * <p>NOTE(review): the URL comes verbatim from a client-supplied header and
+     * is opened as-is; consider restricting the permitted schemes/hosts.</p>
+     *
+     * @param is the request body
+     * @param httpHeaders headers of the current request; {@code fileUrl} is read
+     * @return a stream over the document at {@code fileUrl}, or {@code is} itself
+     * @throws IOException if probing the body fails, or the URL is malformed or
+     *         cannot be opened
+     */
+    public static InputStream getInputSteam(InputStream is, @Context HttpHeaders httpHeaders)
+            throws IOException {
+        String fileUrl = httpHeaders.getHeaderString("fileUrl");
+        // getHeaderString yields null when the header is absent; without the null
+        // check an empty body with no header went on to new URL(null) and failed.
+        if (is.available() == 0 && fileUrl != null && !fileUrl.isEmpty()) {
+            return TikaInputStream.get(new URL(fileUrl), new Metadata());
+        }
+        return is;
+    }
+}