Author: nick
Date: Tue Jun 2 13:15:21 2015
New Revision: 1683101
URL: http://svn.apache.org/r1683101
Log:
BibTeX entry types are case-insensitive, and a BibTeX file might start with a
comment, so tweak the magic and add a test file. (Spotted in govdocs1)
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1683101&r1=1683100&r2=1683101&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
(original)
+++
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Tue Jun 2 13:15:21 2015
@@ -487,7 +487,7 @@
<match value="\xef\xbb\xbf%PDF-" type="string" offset="0"/>
</magic>
<magic priority="20">
- <!-- Low priority match for %PDF near the start of the file -->
+ <!-- Low priority match for %PDF-#.# near the start of the file -->
<!-- Can trigger false positives, so set the priority rather low here -->
<match value="%PDF-1." type="string" offset="1:512"/>
<match value="%PDF-2." type="string" offset="1:512"/>
@@ -2793,17 +2793,34 @@
<match value="%%%\ \ " type="string" offset="73"/>
<match value="%\ BibTeX\ standard\ bibliography\ " type="string" offset="0"/>
<match value="%%%\ \ @BibTeX-style-file{" type="string" offset="73"/>
- <match value="@article{" type="string" offset="0"/>
- <match value="@book{" type="string" offset="0"/>
- <match value="@inbook{" type="string" offset="0"/>
- <match value="@incollection{" type="string" offset="0"/>
- <match value="@inproceedings{" type="string" offset="0"/>
- <match value="@manual{" type="string" offset="0"/>
- <match value="@misc{" type="string" offset="0"/>
- <match value="@preamble{" type="string" offset="0"/>
- <match value="@phdthesis{" type="string" offset="0"/>
- <match value="@techreport{" type="string" offset="0"/>
- <match value="@unpublished{" type="string" offset="0"/>
+ <match value="@article{" type="stringignorecase" offset="0"/>
+ <match value="@book{" type="stringignorecase" offset="0"/>
+ <match value="@inbook{" type="stringignorecase" offset="0"/>
+ <match value="@incollection{" type="stringignorecase" offset="0"/>
+ <match value="@inproceedings{" type="stringignorecase" offset="0"/>
+ <match value="@manual{" type="stringignorecase" offset="0"/>
+ <match value="@misc{" type="stringignorecase" offset="0"/>
+ <match value="@preamble{" type="stringignorecase" offset="0"/>
+ <match value="@phdthesis{" type="stringignorecase" offset="0"/>
+ <match value="@string{" type="stringignorecase" offset="0"/>
+ <match value="@techreport{" type="stringignorecase" offset="0"/>
+ <match value="@unpublished{" type="stringignorecase" offset="0"/>
+ </magic>
+ <magic priority="30">
+ <match value="%" type="string" offset="0">
+ <match value="\n@article{" type="stringignorecase" offset="2:128"/>
+ <match value="\n@book{" type="stringignorecase" offset="2:128"/>
+ <match value="\n@inbook{" type="stringignorecase" offset="2:128"/>
+ <match value="\n@incollection{" type="stringignorecase" offset="2:128"/>
+ <match value="\n@inproceedings{" type="stringignorecase" offset="2:128"/>
+ <match value="\n@manual{" type="stringignorecase" offset="2:128"/>
+ <match value="\n@misc{" type="stringignorecase" offset="2:128"/>
+ <match value="\n@preamble{" type="stringignorecase" offset="2:128"/>
+ <match value="\n@phdthesis{" type="stringignorecase" offset="2:128"/>
+ <match value="\n@string{" type="stringignorecase" offset="2:128"/>
+ <match value="\n@techreport{" type="stringignorecase" offset="2:128"/>
+ <match value="\n@unpublished{" type="stringignorecase" offset="2:128"/>
+ </match>
</magic>
<glob pattern="*.bib"/>
<glob pattern="*.bibtex"/>
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1683101&r1=1683100&r2=1683101&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Tue Jun 2 13:15:21 2015
@@ -932,6 +932,12 @@ public class TestMimeTypes {
}
@Test
+ public void testTextFormats() throws Exception {
+ assertType("application/x-bibtex-text-file", "testBIBTEX.bib");
+ assertTypeByData("application/x-bibtex-text-file", "testBIBTEX.bib");
+ }
+
+ @Test
public void testCodeFormats() throws Exception {
assertType("text/x-csrc", "testC.c");
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib?rev=1683101&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib
(added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib
Tue Jun 2 13:15:21 2015
@@ -0,0 +1,21 @@
+% This is the bibliographic database in bibtex format for NASA LaRC FM
+% Publications IT IS NOT UP TO DATE
+@string{CR = "{NASA} Contractor Report"}
+@string{CP = "{NASA} Conference Publication"}
+@string{TM = "{NASA} Technical Memorandum"}
+@string{TP = "{NASA} Technical Paper"}
+@string{ieeetse = {IEEE Transactions on Software Engineering}}
+@string{larc = {NASA Langley Research Center, Hampton, Virginia}}
+
+@TechReport{ detmode,
+ author = {Steven P. Miller and James N. Pott},
+ title = {{Detecting Mode Confusion Through
+Formal Modeling and Analysis}},
+ month = jan,
+ year = 1999,
+ institution = larc,
+ number = {{CR-1999-208971}},
+ keywords = {PVS, partitioning, avionics, IMA },
+
url={http://wais-gw.larc.nasa.gov:81/techreports.larc.nasa.gov:210/ltrs_index/HTML/2279/1=techreports.larc.nasa.gov%3A210;2=/usr/local/web/waissrc/ltrs_index;3=0%202279%20/usr/local/web/htdocs/ltrs/refer/1999/cr/NASA-99-cr208971.refer.html;4=techreports.larc.nasa.gov%3A210;5=/usr/local/web/waissrc/ltrs_index;6=0%202279%20/usr/local/web/htdocs/ltrs/refer/1999/cr/NASA-99-cr208971.refer.html;7=%00;
},
+ type = CR
+}