Author: tallison
Date: Mon Apr 6 13:18:59 2015
New Revision: 1671533
URL: http://svn.apache.org/r1671533
Log:
TIKA-1519 - don't allow potentially erroneous http-equiv Content-Type to
overwrite Content-Type in HtmlParser
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java?rev=1671533&r1=1671532&r2=1671533&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
Mon Apr 6 13:18:59 2015
@@ -62,12 +62,23 @@ public interface TikaCoreProperties {
/**
* Use this to store parse exception information in the Metadata object.
- */
- public static String TIKA_META_EXCEPTION_PREFIX = TIKA_META_PREFIX+"EXCEPTION"+
- Metadata.NAMESPACE_PREFIX_DELIMITER;
- /**
- * @see DublinCore#FORMAT
- */
+ */
+ public static String TIKA_META_EXCEPTION_PREFIX = TIKA_META_PREFIX+"EXCEPTION"+
+ Metadata.NAMESPACE_PREFIX_DELIMITER;
+
+ /**
+ * This is currently used to identify Content-Type that may be
+ * included within a document, such as in html documents
+ * (e.g. <meta http-equiv="content-type" content="text/html; charset=UTF-8">),
+ * or the value might come from outside the document. This information
+ * may be faulty and should be treated only as a hint.
+ */
+ public static final Property CONTENT_TYPE_HINT =
+ Property.internalText(HttpHeaders.CONTENT_TYPE+"_Hint");
+
+ /**
+ * @see DublinCore#FORMAT
+ */
public static final Property FORMAT =
Property.composite(DublinCore.FORMAT,
new Property[] { Property.internalText(Metadata.FORMAT) });
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=1671533&r1=1671532&r2=1671533&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
Mon Apr 6 13:18:59 2015
@@ -16,6 +16,15 @@
*/
package org.apache.tika.parser.html;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -26,15 +35,6 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
class HtmlHandler extends TextContentHandler {
// List of attributes that need to be resolved.
@@ -160,11 +160,12 @@ class HtmlHandler extends TextContentHan
metadata.set("ICBM", value);
}
} else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)){
+ //don't overwrite Metadata.CONTENT_TYPE!
MediaType type = MediaType.parse(value);
if (type != null) {
- metadata.set(Metadata.CONTENT_TYPE, type.toString());
+ metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
} else {
- metadata.set(Metadata.CONTENT_TYPE, value);
+ metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
}
} else {
metadata.add(name, value);
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1671533&r1=1671532&r2=1671533&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Mon Apr 6 13:18:59 2015
@@ -1016,5 +1016,41 @@ public class HtmlParserTest {
//Expecting first title to be set in meta data and second one to be ignored.
assertEquals("Simple Content", metadata.get(TikaCoreProperties.TITLE));
}
-
+
+ @Test
+ public void testMisleadingMetaContentTypeTags() throws Exception {
+ //TIKA-1519
+
+ String test = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-ELEVEN\">"+
+ "</head><title>title</title><body>body</body></html>";
+ Metadata metadata = new Metadata();
+
+ new HtmlParser().parse(
+ new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("text/html; charset=UTF-ELEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+ assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
+ test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\">"+
+ "</head><title>title</title><body>body</body></html>";
+ metadata = new Metadata();
+
+ new HtmlParser().parse(
+ new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("application/pdf", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+ assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
+ //test two content values
+ test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\" content=\"application/ms-word\">"+
+ "</head><title>title</title><body>body</body></html>";
+ metadata = new Metadata();
+
+ new HtmlParser().parse(
+ new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("application/ms-word", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+ assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
+ }
}