Author: nick
Date: Fri Apr 20 13:32:55 2012
New Revision: 1328370
URL: http://svn.apache.org/viewvc?rev=1328370&view=rev
Log:
TIKA-897 Detect XML files that start with the UTF-8 BOM, plus test
Added:
tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
(with props)
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1328370&r1=1328369&r2=1328370&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
(original)
+++
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Fri Apr 20 13:32:55 2012
@@ -2940,6 +2940,9 @@
<match value="&lt;?xml" type="string" offset="0"/>
<match value="&lt;?XML" type="string" offset="0"/>
<match value="&lt;!--" type="string" offset="0"/>
+ <!-- UTF-8 BOM -->
+ <match value="0xEFBBBF3C3F786D6C" type="string" offset="0"/>
+ <!-- UTF-16 LE/BE -->
<match value="0xFFFE3C003F0078006D006C00" type="string" offset="0"/>
<match value="0xFEFF003C003F0078006D006C" type="string" offset="0"/>
<!-- TODO: Add matches for the other possible XML encoding schemes -->
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=1328370&r1=1328369&r2=1328370&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
Fri Apr 20 13:32:55 2012
@@ -46,6 +46,7 @@ public class MimeDetectionTest extends T
testFile("text/html", "test.html");
testFile("application/xml", "test-iso-8859-1.xml");
testFile("application/xml", "test-utf8.xml");
+ testFile("application/xml", "test-utf8-bom.xml");
testFile("application/xml", "test-utf16le.xml");
testFile("application/xml", "test-utf16be.xml");
testFile("application/xml", "test-long-comment.xml");
Added:
tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml?rev=1328370&view=auto
==============================================================================
---
tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
(added)
+++
tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
Fri Apr 20 13:32:55 2012
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<test hello="world"/>
\ No newline at end of file
Propchange:
tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
------------------------------------------------------------------------------
svn:mime-type = text/xml