Author: kkrugler
Date: Tue May 28 14:25:46 2013
New Revision: 1486936
URL: http://svn.apache.org/r1486936
Log:
TIKA-1102: detect fragment that starts with <div> or <DIV> as HTML.
Added:
tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1486936&r1=1486935&r2=1486936&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue May 28 14:25:46 2013
@@ -4533,6 +4533,8 @@
<match value="<HTML" type="string" offset="0:64"/>
<match value="<BODY" type="string" offset="0"/>
<match value="<body" type="string" offset="0"/>
+ <match value="<DIV" type="string" offset="0"/>
+ <match value="<div" type="string" offset="0"/>
<match value="<TITLE" type="string" offset="0"/>
<match value="<title" type="string" offset="0"/>
<match value="<h1" type="string" offset="0"/>
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=1486936&r1=1486935&r2=1486936&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java Tue May 28 14:25:46 2013
@@ -63,6 +63,8 @@ public class MimeDetectionTest extends T
testFile("text/html", "evilhtml.html");
// add another evil html test from TIKA-357
testFile("text/html", "testlargerbuffer.html");
+ // test fragment of HTML with <div> (TIKA-1102)
+ testFile("text/html", "htmlfragment");
}
public void testByteOrderMark() throws Exception {
Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment?rev=1486936&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment (added)
+++ tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment Tue May 28 14:25:46 2013
@@ -0,0 +1,18 @@
+<div id="leftcol">
+ <ul>
+ <li><a href="/mission/sec/sec.html"> Security and Information Sciences Home ›</a> </li>
+ <li><a href="/mission/sec/publications/-publications.html">Publications ›</a> </li>
+ <li><a href="/mission/sec/corpora/corpora.html">Corpora ›</a> </li>
+ <li><a href="/mission/sec/softwaretools/tools.html">Software Tools ›</a></li>
+ <li><a href="/mission/sec/CSO/CSO.html"> Systems and Operations ›</a>
+ <ul>
+ <li><a href="/mission/sec/publications/-publications.html">Publications ›</a></li>
+ <li><a href="/mission/sec/CSO/biographies/CSObios.html">Biographies ›</a></li>
+ </ul>
+ </li>
+ <li><a href="/mission/sec/CST/CST.html"> Systems and Technology ›</a> </li>
+ <li><a href="/mission/sec/CSA/CSA.html"> System Assessments ›</a> </li>
+ <li><a href="/mission/sec/HLT/HLT.html">Human Language Technology ›</a>
+<li><a href="/mission/sec/computing/computing.html">Computing and Analytics ›</a></li>
+ </ul>
+</div>