Author: jukka
Date: Wed Feb 10 16:06:38 2010
New Revision: 908560
URL: http://svn.apache.org/viewvc?rev=908560&view=rev
Log:
TIKA-377: Error parsing HTML partial with AutoDetect parser
Recognize both upper and lower case versions of HTML tag soup.
Modified:
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Modified:
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=908560&r1=908559&r2=908560&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
(original)
+++
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Wed Feb 10 16:06:38 2010
@@ -3551,9 +3551,13 @@
bad HTML, unfortunately.
-->
<root-XML localName="html"/>
+ <root-XML localName="HTML"/>
<root-XML localName="link"/>
+ <root-XML localName="LINK"/>
<root-XML localName="body"/>
+ <root-XML localName="BODY"/>
<root-XML localName="p"/>
+ <root-XML localName="P"/>
<magic priority="50">
<match value="&lt;!DOCTYPE HTML" type="string" offset="0:64"/>
<match value="&lt;!doctype html" type="string" offset="0:64"/>