Author: nick
Date: Fri Apr 20 13:32:55 2012
New Revision: 1328370

URL: http://svn.apache.org/viewvc?rev=1328370&view=rev
Log:
TIKA-897 Detect XML files that start with the UTF-8 BOM, plus test

Added:
    tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml   (with props)
Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1328370&r1=1328369&r2=1328370&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Fri Apr 20 13:32:55 2012
@@ -2940,6 +2940,9 @@
       <match value="&lt;?xml" type="string" offset="0"/>
       <match value="&lt;?XML" type="string" offset="0"/>
       <match value="&lt;!--" type="string" offset="0"/>
+      <!-- UTF-8 BOM -->
+      <match value="0xEFBBBF3C3F786D6C" type="string" offset="0"/>
+      <!-- UTF-16 LE/BE -->
       <match value="0xFFFE3C003F0078006D006C00" type="string" offset="0"/>
       <match value="0xFEFF003C003F0078006D006C" type="string" offset="0"/>
       <!-- TODO: Add matches for the other possible XML encoding schemes -->

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=1328370&r1=1328369&r2=1328370&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java Fri Apr 20 13:32:55 2012
@@ -46,6 +46,7 @@ public class MimeDetectionTest extends T
         testFile("text/html", "test.html");
         testFile("application/xml", "test-iso-8859-1.xml");
         testFile("application/xml", "test-utf8.xml");
+        testFile("application/xml", "test-utf8-bom.xml");
         testFile("application/xml", "test-utf16le.xml");
         testFile("application/xml", "test-utf16be.xml");
         testFile("application/xml", "test-long-comment.xml");

Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml?rev=1328370&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml (added)
+++ tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml Fri Apr 20 13:32:55 2012
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<test hello="world"/>
\ No newline at end of file

Propchange: tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
------------------------------------------------------------------------------
    svn:mime-type = text/xml


Reply via email to