Author: tallison
Date: Mon Apr  6 13:18:59 2015
New Revision: 1671533

URL: http://svn.apache.org/r1671533
Log:
TIKA-1519 - don't allow potentially erroneous http-equiv Content-Type to 
overwrite Content-Type in HtmlParser

Modified:
    
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java?rev=1671533&r1=1671532&r2=1671533&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
 (original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
 Mon Apr  6 13:18:59 2015
@@ -62,12 +62,23 @@ public interface TikaCoreProperties {
 
     /**
      * Use this to store parse exception information in the Metadata object.
-     */
-    public static String TIKA_META_EXCEPTION_PREFIX = 
TIKA_META_PREFIX+"EXCEPTION"+
-            Metadata.NAMESPACE_PREFIX_DELIMITER;
-    /**
-     * @see DublinCore#FORMAT
-     */
+     */
+    public static String TIKA_META_EXCEPTION_PREFIX = 
TIKA_META_PREFIX+"EXCEPTION"+
+            Metadata.NAMESPACE_PREFIX_DELIMITER;
+
+    /**
+     * This is currently used to identify Content-Type that may be
+     * included within a document, such as in html documents
+     * (e.g. <meta http-equiv="content-type" content="text/html; charset=UTF-8">),
+     * or the value might come from outside the document.  This information
+     * may be faulty and should be treated only as a hint.
+     */
+    public static final Property CONTENT_TYPE_HINT =
+            Property.internalText(HttpHeaders.CONTENT_TYPE+"_Hint");
+
+    /**
+     * @see DublinCore#FORMAT
+     */
     public static final Property FORMAT = 
Property.composite(DublinCore.FORMAT, 
             new Property[] { Property.internalText(Metadata.FORMAT) });
     

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=1671533&r1=1671532&r2=1671533&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 Mon Apr  6 13:18:59 2015
@@ -16,6 +16,15 @@
  */
 package org.apache.tika.parser.html;
 
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
@@ -26,15 +35,6 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
 class HtmlHandler extends TextContentHandler {
 
     // List of attributes that need to be resolved.
@@ -160,11 +160,12 @@ class HtmlHandler extends TextContentHan
                 metadata.set("ICBM", value);
             }
         } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)){
+            //TIKA-1519: record the http-equiv value only as a hint; don't overwrite Metadata.CONTENT_TYPE!
             MediaType type = MediaType.parse(value);
             if (type != null) {
-                metadata.set(Metadata.CONTENT_TYPE, type.toString());
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, 
type.toString());
             } else {
-                metadata.set(Metadata.CONTENT_TYPE, value);
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
             }
         } else {
             metadata.add(name, value);

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1671533&r1=1671532&r2=1671533&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 Mon Apr  6 13:18:59 2015
@@ -1016,5 +1016,41 @@ public class HtmlParserTest {
         //Expecting first title to be set in meta data and second one to be 
ignored.
         assertEquals("Simple Content", metadata.get(TikaCoreProperties.TITLE));
     }
-    
+
+    @Test
+    public void testMisleadingMetaContentTypeTags() throws Exception {
+        //TIKA-1519
+
+        String test = "<html><head><meta http-equiv=\"content-type\" 
content=\"text/html; charset=UTF-ELEVEN\">"+
+                "</head><title>title</title><body>body</body></html>";
+        Metadata metadata = new Metadata();
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/html; charset=UTF-ELEVEN", 
metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+        assertEquals("text/html; charset=ISO-8859-1", 
metadata.get(Metadata.CONTENT_TYPE));
+
+        test = "<html><head><meta http-equiv=\"content-type\" 
content=\"application/pdf\">"+
+                "</head><title>title</title><body>body</body></html>";
+        metadata = new Metadata();
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("application/pdf", 
metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+        assertEquals("text/html; charset=ISO-8859-1", 
metadata.get(Metadata.CONTENT_TYPE));
+
+        //test two content values
+        test = "<html><head><meta http-equiv=\"content-type\" 
content=\"application/pdf\" content=\"application/ms-word\">"+
+                "</head><title>title</title><body>body</body></html>";
+        metadata = new Metadata();
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("application/ms-word", 
metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+        assertEquals("text/html; charset=ISO-8859-1", 
metadata.get(Metadata.CONTENT_TYPE));
+
+    }
 }


Reply via email to