Author: kkrugler
Date: Tue May 28 14:25:46 2013
New Revision: 1486936

URL: http://svn.apache.org/r1486936
Log:
TIKA-1102: detect fragment that starts with <div> or <DIV> as HTML.

Added:
    tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment
Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1486936&r1=1486935&r2=1486936&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue May 28 14:25:46 2013
@@ -4533,6 +4533,8 @@
       <match value="&lt;HTML" type="string" offset="0:64"/>
       <match value="&lt;BODY" type="string" offset="0"/>
       <match value="&lt;body" type="string" offset="0"/>
+      <match value="&lt;DIV" type="string" offset="0"/>
+      <match value="&lt;div" type="string" offset="0"/>
       <match value="&lt;TITLE" type="string" offset="0"/>
       <match value="&lt;title" type="string" offset="0"/>
       <match value="&lt;h1" type="string" offset="0"/>

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=1486936&r1=1486935&r2=1486936&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java Tue May 28 14:25:46 2013
@@ -63,6 +63,8 @@ public class MimeDetectionTest extends T
         testFile("text/html", "evilhtml.html");
         // add another evil html test from TIKA-357
         testFile("text/html", "testlargerbuffer.html");
+        // test fragment of HTML with <div> (TIKA-1102)
+        testFile("text/html", "htmlfragment");
     }
 
     public void testByteOrderMark() throws Exception {

Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment?rev=1486936&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment (added)
+++ tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment Tue May 28 14:25:46 2013
@@ -0,0 +1,18 @@
+<div id="leftcol">
+         <ul>
+        <li><a href="/mission/sec/sec.html"> Security and Information Sciences Home&nbsp;&rsaquo;</a>        </li>
+        <li><a href="/mission/sec/publications/-publications.html">Publications&nbsp;&rsaquo;</a> </li>
+        <li><a href="/mission/sec/corpora/corpora.html">Corpora&nbsp;&rsaquo;</a> </li>
+        <li><a href="/mission/sec/softwaretools/tools.html">Software Tools&nbsp;&rsaquo;</a></li>
+        <li><a href="/mission/sec/CSO/CSO.html"> Systems and Operations&nbsp;&rsaquo;</a>
+          <ul>
+            <li><a href="/mission/sec/publications/-publications.html">Publications &rsaquo;</a></li>
+            <li><a href="/mission/sec/CSO/biographies/CSObios.html">Biographies&nbsp;&rsaquo;</a></li>
+          </ul>
+        </li>
+        <li><a href="/mission/sec/CST/CST.html"> Systems and Technology&nbsp;&rsaquo;</a> </li>
+        <li><a href="/mission/sec/CSA/CSA.html"> System Assessments&nbsp;&rsaquo;</a> </li>
+           <li><a href="/mission/sec/HLT/HLT.html">Human Language Technology&nbsp;&rsaquo;</a>
+<li><a href="/mission/sec/computing/computing.html">Computing and Analytics&nbsp;&rsaquo;</a></li>
+  </ul>
+</div>


Reply via email to