Author: nick
Date: Tue Jun  2 13:15:21 2015
New Revision: 1683101

URL: http://svn.apache.org/r1683101
Log:
BibTeX entries are case-insensitive, and might start with a comment, so tweak 
the magic and add a test file. (Spotted in govdocs1)

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib
Modified:
    
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1683101&r1=1683100&r2=1683101&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Tue Jun  2 13:15:21 2015
@@ -487,7 +487,7 @@
       <match value="\xef\xbb\xbf%PDF-" type="string" offset="0"/>
     </magic>
     <magic priority="20">
-      <!-- Low priority match for %PDF near the start of the file -->
+      <!-- Low priority match for %PDF-#.# near the start of the file -->
       <!-- Can trigger false positives, so set the priority rather low here -->
       <match value="%PDF-1." type="string" offset="1:512"/>
       <match value="%PDF-2." type="string" offset="1:512"/>
@@ -2793,17 +2793,34 @@
       <match value="%%%\ \ " type="string" offset="73"/>
       <match value="%\ BibTeX\ standard\ bibliography\ " type="string" 
offset="0"/>
       <match value="%%%\ \ @BibTeX-style-file{" type="string" offset="73"/>
-      <match value="@article{" type="string" offset="0"/>
-      <match value="@book{" type="string" offset="0"/>
-      <match value="@inbook{" type="string" offset="0"/>
-      <match value="@incollection{" type="string" offset="0"/>
-      <match value="@inproceedings{" type="string" offset="0"/>
-      <match value="@manual{" type="string" offset="0"/>
-      <match value="@misc{" type="string" offset="0"/>
-      <match value="@preamble{" type="string" offset="0"/>
-      <match value="@phdthesis{" type="string" offset="0"/>
-      <match value="@techreport{" type="string" offset="0"/>
-      <match value="@unpublished{" type="string" offset="0"/>
+      <match value="@article{" type="stringignorecase" offset="0"/>
+      <match value="@book{" type="stringignorecase" offset="0"/>
+      <match value="@inbook{" type="stringignorecase" offset="0"/>
+      <match value="@incollection{" type="stringignorecase" offset="0"/>
+      <match value="@inproceedings{" type="stringignorecase" offset="0"/>
+      <match value="@manual{" type="stringignorecase" offset="0"/>
+      <match value="@misc{" type="stringignorecase" offset="0"/>
+      <match value="@preamble{" type="stringignorecase" offset="0"/>
+      <match value="@phdthesis{" type="stringignorecase" offset="0"/>
+      <match value="@string{" type="stringignorecase" offset="0"/>
+      <match value="@techreport{" type="stringignorecase" offset="0"/>
+      <match value="@unpublished{" type="stringignorecase" offset="0"/>
+    </magic>
+    <magic priority="30">
+      <match value="%" type="string" offset="0">
+         <match value="\n@article{" type="stringignorecase" offset="2:128"/>
+         <match value="\n@book{" type="stringignorecase" offset="2:128"/>
+         <match value="\n@inbook{" type="stringignorecase" offset="2:128"/>
+         <match value="\n@incollection{" type="stringignorecase" 
offset="2:128"/>
+         <match value="\n@inproceedings{" type="stringignorecase" 
offset="2:128"/>
+         <match value="\n@manual{" type="stringignorecase" offset="2:128"/>
+         <match value="\n@misc{" type="stringignorecase" offset="2:128"/>
+         <match value="\n@preamble{" type="stringignorecase" offset="2:128"/>
+         <match value="\n@phdthesis{" type="stringignorecase" offset="2:128"/>
+         <match value="\n@string{" type="stringignorecase" offset="2:128"/>
+         <match value="\n@techreport{" type="stringignorecase" offset="2:128"/>
+         <match value="\n@unpublished{" type="stringignorecase" 
offset="2:128"/>
+      </match>
     </magic>
     <glob pattern="*.bib"/>
     <glob pattern="*.bibtex"/>

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1683101&r1=1683100&r2=1683101&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
(original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
Tue Jun  2 13:15:21 2015
@@ -932,6 +932,12 @@ public class TestMimeTypes {
     }
     
     @Test
+    public void testTextFormats() throws Exception {
+        assertType("application/x-bibtex-text-file", "testBIBTEX.bib");
+        assertTypeByData("application/x-bibtex-text-file", "testBIBTEX.bib");
+    }
+    
+    @Test
     public void testCodeFormats() throws Exception {
         assertType("text/x-csrc", "testC.c");
         

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib?rev=1683101&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib 
(added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib 
Tue Jun  2 13:15:21 2015
@@ -0,0 +1,21 @@
+% This is the bibliographic database in bibtex format for NASA LaRC FM  
+% Publications IT IS NOT UP TO DATE
+@string{CR = "{NASA} Contractor Report"}
+@string{CP = "{NASA} Conference Publication"}
+@string{TM = "{NASA} Technical Memorandum"}
+@string{TP = "{NASA} Technical Paper"}
+@string{ieeetse = {IEEE Transactions on Software Engineering}}
+@string{larc = {NASA Langley Research Center, Hampton, Virginia}}
+
+@TechReport{ detmode,
+  author = {Steven P. Miller and James N. Pott},
+  title = {{Detecting Mode Confusion Through
+Formal Modeling and Analysis}},
+  month = jan,
+  year = 1999,
+  institution = larc,
+  number = {{CR-1999-208971}},
+  keywords = {PVS, partitioning, avionics, IMA },
+  
url={http://wais-gw.larc.nasa.gov:81/techreports.larc.nasa.gov:210/ltrs_index/HTML/2279/1=techreports.larc.nasa.gov%3A210;2=/usr/local/web/waissrc/ltrs_index;3=0%202279%20/usr/local/web/htdocs/ltrs/refer/1999/cr/NASA-99-cr208971.refer.html;4=techreports.larc.nasa.gov%3A210;5=/usr/local/web/waissrc/ltrs_index;6=0%202279%20/usr/local/web/htdocs/ltrs/refer/1999/cr/NASA-99-cr208971.refer.html;7=%00;
 },                
+  type = CR
+}


Reply via email to