[ 
https://issues.apache.org/jira/browse/TIKA-2648?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16568369#comment-16568369
 ] 

ASF GitHub Bot commented on TIKA-2648:
--------------------------------------

tballison closed pull request #236: TIKA-2648 : detect interpreted server-side 
scripting languages
URL: https://github.com/apache/tika/pull/236
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff of a foreign pull request (one opened
from a fork) once it is merged, the diff is reproduced below for the
sake of provenance:

diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java 
b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
index b4d651e42..d52c20b3d 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
@@ -110,6 +110,12 @@ public static boolean isValid(String name) {
      */
     private List<String> extensions = null;
 
+    /**
+     * Whether this mime-type is used for server-side scripts,
+     * and thus cannot reliably be used for filename-based type detection
+     */
+    private boolean isInterpreted = false;
+
     /**
      * Creates a media type with the give name and containing media type
      * registry. The name is expected to be valid and normalized to lower
@@ -302,6 +308,17 @@ public boolean matches(byte[] data) {
         return matchesMagic(data);
     }
 
+    /**
+     * whether the type is used as a server-side scripting technology
+     */
+    boolean isInterpreted() {
+        return isInterpreted;
+    }
+
+    void setInterpreted(boolean interpreted) {
+        isInterpreted = interpreted;
+    }
+
     /**
      * Defines a RootXML description. RootXML is made of a localName and/or a
      * namespaceURI.
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java 
b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
index 4acfe0155..501793e57 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
@@ -502,10 +502,13 @@ public MediaType detect(InputStream input, Metadata 
metadata)
         String resourceName = 
metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
         if (resourceName != null) {
             String name = null;
+            boolean isHttp = false;
 
             // Deal with a URI or a path name in as the resource  name
             try {
                 URI uri = new URI(resourceName);
+                String scheme = uri.getScheme();
+                isHttp = scheme != null && scheme.startsWith("http"); // http 
or https
                 String path = uri.getPath();
                 if (path != null) {
                     int slash = path.lastIndexOf('/');
@@ -519,11 +522,14 @@ public MediaType detect(InputStream input, Metadata 
metadata)
 
             if (name != null) {
                 MimeType hint = getMimeType(name);
-                
-                // If we have some types based on mime magic, try to specialise
-                //  and/or select the type based on that
-                // Otherwise, use the type identified from the name
-                possibleTypes = applyHint(possibleTypes, hint);
+
+                // For server-side scripting languages, we cannot rely on the 
filename to detect the mime type
+                if (!(isHttp && hint.isInterpreted())) {
+                    // If we have some types based on mime magic, try to 
specialise
+                    //  and/or select the type based on that
+                    // Otherwise, use the type identified from the name
+                    possibleTypes = applyHint(possibleTypes, hint);
+                }
             }
         }
 
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java 
b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
index 565aaf83d..50cc21b62 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
@@ -169,8 +169,11 @@ public void startElement(
         if (type == null) {
             if (MIME_TYPE_TAG.equals(qName)) {
                 String name = attributes.getValue(MIME_TYPE_TYPE_ATTR);
+                String interpretedAttr = attributes.getValue(INTERPRETED_ATTR);
+                boolean interpreted = "true".equals(interpretedAttr);
                 try {
                     type = types.forName(name);
+                    type.setInterpreted(interpreted);
                 } catch (MimeTypeException e) {
                     handleMimeError(name, e, qName, attributes);
                 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java 
b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
index 98bfee5cc..c77cc5c43 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
@@ -27,6 +27,8 @@
 
     String MIME_TYPE_TYPE_ATTR = "type";
 
+    String INTERPRETED_ATTR = "interpreted";
+
     String ACRONYM_TAG = "acronym";
 
     String COMMENT_TAG = "_comment";
diff --git 
a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 104cd2ce7..256c26fd4 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5987,13 +5987,13 @@
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/asp">
+  <mime-type type="text/asp" interpreted="true">
     <_comment>Active Server Page</_comment>
     <glob pattern="*.asp"/>
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/aspdotnet">
+  <mime-type type="text/aspdotnet" interpreted="true">
     <_comment>ASP .NET</_comment>
     <glob pattern="*.aspx"/>
     <sub-class-of type="text/plain"/>
@@ -6414,7 +6414,7 @@
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/x-cgi">
+  <mime-type type="text/x-cgi" interpreted="true">
     <_comment>CGI script</_comment>
     <glob pattern="*.cgi"/>
     <sub-class-of type="text/plain"/>
@@ -6468,7 +6468,7 @@
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/x-coldfusion">
+  <mime-type type="text/x-coldfusion" interpreted="true">
     <_comment>ColdFusion source code</_comment>
     <glob pattern="*.cfm"/>
     <glob pattern="*.cfml"/>
@@ -6584,7 +6584,7 @@
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/x-jsp">
+  <mime-type type="text/x-jsp" interpreted="true">
     <_comment>Java Server Page</_comment>
     <alias type="application/x-httpd-jsp"/>
     <sub-class-of type="text/plain"/>
@@ -6712,7 +6712,7 @@
     <sub-class-of type="text/plain"/>
   </mime-type>
 
-  <mime-type type="text/x-php">
+  <mime-type type="text/x-php" interpreted="true">
     <_comment>PHP script</_comment>
     <magic priority="50">
       <match value="&lt;?php" type="string" offset="0"/>
diff --git a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java 
b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
index 89287270c..df51d4525 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
@@ -23,6 +23,7 @@
 
 import org.junit.Test;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
@@ -89,6 +90,7 @@ public void testCustomReader() throws Exception {
     assertEquals(1, reader.ignorePatterns.size());
     assertEquals(another.toString()+">>*"+hello.getExtension(), 
         reader.ignorePatterns.get(0));
+    assertTrue("Server-side script type not detected", 
another.isInterpreted());
     
     //System.out.println( mimeTypes.getMediaTypeRegistry().getTypes() );
   }
diff --git 
a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java 
b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
index 3e1d52bc6..c0f110938 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -16,22 +16,20 @@
  */
 package org.apache.tika.mime;
 
-import static java.nio.charset.StandardCharsets.UTF_16BE;
-import static java.nio.charset.StandardCharsets.UTF_16LE;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Before;
+import org.junit.Test;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
 
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Before;
-import org.junit.Test;
+import static java.nio.charset.StandardCharsets.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 public class MimeDetectionTest {
 
@@ -82,6 +80,21 @@ public void testDetection() throws Exception {
         testFile("application/dif+xml", "brwNIMS_2014.dif");
     }
 
+    @Test
+    public void testDetectionWithoutContent() throws IOException {
+        testUrlWithoutContent("text/html", "test.html");
+        testUrlWithoutContent("text/html", "http://test.com/test.html");
+        testUrlWithoutContent("text/plain", "http://test.com/test.txt");
+
+        // In case the url contains a filename referencing a server-side 
scripting language,
+        // it gives us no clue concerning the actual mime type of the response
+        testUrlWithoutContent("application/octet-stream", 
"http://test.com/test.php");
+        testUrlWithoutContent("application/octet-stream", 
"http://test.com/test.cgi");
+        testUrlWithoutContent("application/octet-stream", 
"http://test.com/test.jsp");
+        // But in case the protocol is not http or https, the script is 
probably not interpreted
+        testUrlWithoutContent("text/x-php", "ftp://test.com/test.php");
+    }
+
     @Test
     public void testByteOrderMark() throws Exception {
         assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
@@ -136,6 +149,13 @@ private void testUrlOnly(String expected, String url) 
throws IOException{
         testStream(expected, url, in);
     }
 
+    private void testUrlWithoutContent(String expected, String url) throws 
IOException {
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, url);
+        String mime = this.mimeTypes.detect(null, metadata).toString();
+        assertEquals(url + " is not properly detected using only resource 
name", expected, mime);
+    }
+
     private void testUrl(String expected, String url, String file) throws 
IOException{
         InputStream in = getClass().getResourceAsStream(file);
         testStream(expected, url, in);
diff --git 
a/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml 
b/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml
index 2001d5926..92d70cb65 100644
--- a/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml
+++ b/tika-core/src/test/resources/org/apache/tika/mime/custom-mimetypes2.xml
@@ -16,7 +16,7 @@
   limitations under the License.
 -->
 <mime-info>
-  <mime-type type="another/world-file">
+  <mime-type type="another/world-file" interpreted="true">
      <hello>kittens</hello>
      <glob pattern="*.hello.world" /> <!-- Will collide with 
'hello/world-file'  -->
      <sub-class-of type="hello/world" />


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> mime detection based on resource name detects resources as "text/x-php" 
> instead of "text/html" 
> -----------------------------------------------------------------------------------------------
>
>                 Key: TIKA-2648
>                 URL: https://issues.apache.org/jira/browse/TIKA-2648
>             Project: Tika
>          Issue Type: Bug
>            Reporter: Gerard Bouchar
>            Priority: Major
>
> When using Tika to detect a mime type given only a URL containing ".php" and 
> a content-type hint of "text/html", it guesses "text/x-php", whereas one 
> would expect "text/html".
> {code}
> TikaConfig tika = new TikaConfig();
> Metadata metadata = new Metadata();
> String url = "https://www.facebook.com/home.php";
> metadata.set(Metadata.RESOURCE_NAME_KEY, url);
> metadata.set(Metadata.CONTENT_TYPE, "text/html");
> MediaType type = tika.getDetector().detect(null, metadata);
> System.out.println(url + " is of type " + type.toString());
> // Prints https://www.facebook.com/home.php is of type text/x-php
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to