[4/4] tika git commit: PKCS7 signature detection tests, using test files from TIKA-1821
PKCS7 signature detection tests, using test files from TIKA-1821 Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/046e43f8 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/046e43f8 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/046e43f8 Branch: refs/heads/master Commit: 046e43f81c37f0ab92d9420fab8b645969d5a13c Parents: 57ae2c5 Author: Nick Burch <n...@gagravarr.org> Authored: Wed Feb 3 14:13:30 2016 + Committer: Nick Burch <n...@gagravarr.org> Committed: Wed Feb 3 14:13:30 2016 + -- .../test/java/org/apache/tika/mime/TestMimeTypes.java | 13 + 1 file changed, 13 insertions(+) -- http://git-wip-us.apache.org/repos/asf/tika/blob/046e43f8/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java -- diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 2f9193d..77d25df 100644 --- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -979,6 +979,19 @@ public class TestMimeTypes { assertTypeByData("text/vtt", "testWebVTT.vtt"); } +@Test +public void testPKCSSignatures() throws Exception { +// PKCS7 Signed XML files +assertType("application/pkcs7-signature", "testPKCS17Sig.xml.p7m"); +assertType("application/pkcs7-signature", "testPKCS17Sig-v2.xml.p7m"); +assertType("application/pkcs7-signature", "testPKCS17Sig-v3.xml.p7m"); +assertType("application/pkcs7-signature", "testPKCS17Sig-v4.xml.p7m"); +assertTypeByData("application/pkcs7-signature", "testPKCS17Sig.xml.p7m"); +assertTypeByData("application/pkcs7-signature", "testPKCS17Sig-v2.xml.p7m"); +assertTypeByData("application/pkcs7-signature", "testPKCS17Sig-v3.xml.p7m"); +assertTypeByData("application/pkcs7-signature", "testPKCS17Sig-v4.xml.p7m"); +} + private void assertText(byte[] prefix) throws IOException { assertMagic("text/plain", 
prefix); }
[3/4] tika git commit: Unit test for detecting JS files
Unit test for detecting JS files As we don't currently have any JS file magic, we can't detect as such without the file name. However, with the filename, ensure we do get it right, even if there's HTML snippet in the JS. TIKA-1141 Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/557b3704 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/557b3704 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/557b3704 Branch: refs/heads/master Commit: 557b3704501a9692809a3e1b7838866786ed3366 Parents: d740f5d Author: Nick Burch <n...@gagravarr.org> Authored: Wed Feb 3 17:20:55 2016 + Committer: Nick Burch <n...@gagravarr.org> Committed: Wed Feb 3 17:25:52 2016 + -- .../test/java/org/apache/tika/mime/TestMimeTypes.java | 12 1 file changed, 12 insertions(+) -- http://git-wip-us.apache.org/repos/asf/tika/blob/557b3704/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java -- diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 77d25df..92f7b88 100644 --- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -971,6 +971,18 @@ public class TestMimeTypes { assertTypeByData("text/x-matlab", "testMATLAB.m"); assertTypeByData("text/x-matlab", "testMATLAB_wtsgaus.m"); assertTypeByData("text/x-matlab", "testMATLAB_barcast.m"); + +// By name, or by name+data, gets it as JS +assertTypeByName("application/javascript", "testJS.js"); +assertTypeByName("application/javascript", "testJS_HTML.js"); +assertType("application/javascript", "testJS.js"); +assertType("application/javascript", "testJS_HTML.js"); + +// With data only, because we have no JS file magic, can't be +// detected. One will come through as plain text, the other +// as HTML due to in it. TODO Add JS magic. 
See TIKA-1141 +//assertTypeByData("application/javascript", "testJS.js"); +//assertTypeByData("application/javascript", "testJS_HTML.js"); } @Test
[2/4] tika git commit: Lower the priority of the HTML detection magic
Lower the priority of http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d740f5d8 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d740f5d8 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d740f5d8 Branch: refs/heads/master Commit: d740f5d8b2e42b1db42806ddd395e034cb416fd4 Parents: d8a2fc0 Author: Nick Burch <n...@gagravarr.org> Authored: Wed Feb 3 17:11:06 2016 + Committer: Nick Burch <n...@gagravarr.org> Committed: Wed Feb 3 17:11:06 2016 + -- .../org/apache/tika/mime/tika-mimetypes.xml| 17 +++-- 1 file changed, 11 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/tika/blob/d740f5d8/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml -- diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 1d7b42b..95f41e6 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -5432,12 +5432,6 @@ - - @@ -5449,6 +5443,17 @@ + + + + + +
[4/4] tika git commit: Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/tika
Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/tika Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6c0b7906 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6c0b7906 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6c0b7906 Branch: refs/heads/master Commit: 6c0b7906ecbc22ea9adb4c1e5781b0eff561957d Parents: 557b370 1e0159b Author: Nick Burch <n...@gagravarr.org> Authored: Wed Feb 3 17:26:09 2016 + Committer: Nick Burch <n...@gagravarr.org> Committed: Wed Feb 3 17:26:09 2016 + -- .../tika/parser/rtf/RTFEmbObjHandler.java | 2 +- .../apache/tika/parser/rtf/RTFParserTest.java | 9 +- .../tika/server/resource/TikaResource.java | 14 +- .../apache/tika/server/TikaResourceTest.java| 12 + .../testRTF_npeFromWMFInTikaServer.rtf | 235 +++ 5 files changed, 262 insertions(+), 10 deletions(-) --
[1/4] tika git commit: Test JS file that includes an HTML snippet
Repository: tika Updated Branches: refs/heads/master 1e0159b73 -> 6c0b7906e Test JS file that includes http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d8a2fc01 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d8a2fc01 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d8a2fc01 Branch: refs/heads/master Commit: d8a2fc01b4da5ffb7be19864512401c54aa04bfd Parents: 046e43f Author: Nick Burch <n...@gagravarr.org> Authored: Wed Feb 3 17:10:33 2016 + Committer: Nick Burch <n...@gagravarr.org> Committed: Wed Feb 3 17:10:33 2016 + -- .../resources/test-documents/testJS_HTML.js | 91 1 file changed, 91 insertions(+) -- http://git-wip-us.apache.org/repos/asf/tika/blob/d8a2fc01/tika-parsers/src/test/resources/test-documents/testJS_HTML.js -- diff --git a/tika-parsers/src/test/resources/test-documents/testJS_HTML.js b/tika-parsers/src/test/resources/test-documents/testJS_HTML.js new file mode 100644 index 000..a362198 --- /dev/null +++ b/tika-parsers/src/test/resources/test-documents/testJS_HTML.js @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +var places = new Array(); + +places[0] = { + 'name': 'Oxford', lat: 51.75222, lng: -1.25596, + 'id': 'map_1', +} +places[1] = { + 'name': 'Oxford', lat: 41.43399, lng: -73.11678, + 'id': 'map_2', +} +places[2] = { + 'name': 'Oxford', lat: -43.3, lng: 172.18333, + 'id': 'map_3', +} +places[3] = { + 'name': 'Oxford', lat: 33.619, lng: -83.86741, + 'id': 'map_4', +} +places[4] = { + 'name': 'Oxford', lat: 44.13174, lng: -70.49311, + 'id': 'map_5', +} +places[5] = { + 'name': 'Oxford', lat: 39.78539, lng: -75.97883, + 'id': 'map_6', +} +places[6] = { + 'name': 'Oxford', lat: 40.51976, lng: -87.24779, + 'id': 'map_7', +} +places[7] = { + 'name': 'Oxford', lat: 45.73345, lng: -63.86542, + 'id': 'map_8', +} +places[8] = { + 'name': 'Oxford', lat: 42.44202, lng: -75.59769, + 'id': 'map_9', +} +places[9] = { + 'name': 'Oxford', lat: 40.80315, lng: -74.98962, + 'id': 'map_10', +} + +function drawMaps() { + if (GBrowserIsCompatible()) { + for(var i in places) { + var p = places[i]; + var div = document.getElementById(p['id']); + + div.style.display = "block"; + div.parentNode.style.marginBottom = "35px"; + + var map = new GMap2(div); + map.setCenter(new GLatLng(p['lat'], p['lng']), 8); + + var m = new GMarker( +new GLatLng(p['lat'], p['lng']), +{title: p['name']} + ); + map.addOverlay(m); + } + } else { + document.write("Unsupported Browser"); + } +} + +var t; +$(document).ready(function(){ + t = setTimeout(function() { + clearTimeout(t); + drawMaps(); + }, 15*1000); +});
[1/2] tika git commit: TIKA-1823 Sample AutoCAD 2010 DWF file
Repository: tika Updated Branches: refs/heads/master 5c0ef63e4 -> 6a0923326 TIKA-1823 Sample AutoCAD 2010 DWF file AutoCAD supplied sample file for AutoCAD 2010 DWF, from https://knowledge.autodesk.com/support/autocad/downloads/caas/downloads/content/autocad-sample-files.html Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/38fbc504 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/38fbc504 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/38fbc504 Branch: refs/heads/master Commit: 38fbc504944732f6aefddc3ce7e802a5103b6f89 Parents: 5c0ef63 Author: Nick Burch <n...@gagravarr.org> Authored: Tue Jan 26 16:26:55 2016 + Committer: Nick Burch <n...@gagravarr.org> Committed: Tue Jan 26 16:26:55 2016 + -- .../test/resources/test-documents/testDWF2010.dwf | Bin 0 -> 101370 bytes 1 file changed, 0 insertions(+), 0 deletions(-) -- http://git-wip-us.apache.org/repos/asf/tika/blob/38fbc504/tika-parsers/src/test/resources/test-documents/testDWF2010.dwf -- diff --git a/tika-parsers/src/test/resources/test-documents/testDWF2010.dwf b/tika-parsers/src/test/resources/test-documents/testDWF2010.dwf new file mode 100644 index 000..f72f4e6 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testDWF2010.dwf differ
[2/2] tika git commit: TIKA-1823 AutoCAD DWF mime magic and subtypes
TIKA-1823 AutoCAD DWF mime magic and subtypes Parent AutoCAD DWF mimetype and general magic, based on patch from Luca Moretti, along with version-specific subtypes with more specific magic Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6a092332 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6a092332 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6a092332 Branch: refs/heads/master Commit: 6a092332686e02ba26456e52fb0ce8bc5b42be56 Parents: 38fbc50 Author: Nick Burch <n...@gagravarr.org> Authored: Tue Jan 26 16:31:17 2016 + Committer: Nick Burch <n...@gagravarr.org> Committed: Tue Jan 26 16:31:17 2016 + -- .../org/apache/tika/mime/tika-mimetypes.xml | 30 .../org/apache/tika/mime/TestMimeTypes.java | 5 2 files changed, 35 insertions(+) -- http://git-wip-us.apache.org/repos/asf/tika/blob/6a092332/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml -- diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 5d152a5..1d7b42b 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -5250,8 +5250,38 @@ +DWF +<_comment>AutoCAD Design Web Format + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_comment>AutoCAD Design Web Format http://git-wip-us.apache.org/repos/asf/tika/blob/6a092332/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java -- diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 28aae81..2f9193d 100644 --- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -538,6 +538,11 @@ public class TestMimeTypes { assertTypeByData("image/vnd.dwg", 
"testDWG2010.dwg"); // From name, gets the common parent type +assertTypeByName("model/vnd.dwf", "x.dwf"); +// With the data, can work out it's the v6 zip-based flavour +assertTypeByData("model/vnd.dwf; version=6", "testDWF2010.dwf"); + +// From name, gets the common parent type assertTypeByName("image/vnd.dxf", "x.dxf"); // With the data, can work out it's the ASCII flavour assertTypeByData("image/vnd.dxf; format=ascii", "testDXF_ascii.dxf");
svn commit: r1726164 - in /tika/site/src/site/apt: 0.10/ 0.7/ 0.8/ 0.9/ 1.0/ 1.1/ 1.10/ 1.11/ 1.2/ 1.3/ 1.4/ 1.5/ 1.6/ 1.7/ 1.8/ 1.9/
Author: nick Date: Fri Jan 22 09:37:17 2016 New Revision: 1726164 URL: http://svn.apache.org/viewvc?rev=1726164=rev Log: Remove odd extra header line Modified: tika/site/src/site/apt/0.10/parser_guide.apt tika/site/src/site/apt/0.7/parser_guide.apt tika/site/src/site/apt/0.8/parser_guide.apt tika/site/src/site/apt/0.9/parser_guide.apt tika/site/src/site/apt/1.0/parser_guide.apt tika/site/src/site/apt/1.1/parser_guide.apt tika/site/src/site/apt/1.10/parser_guide.apt tika/site/src/site/apt/1.11/parser_guide.apt tika/site/src/site/apt/1.2/parser_guide.apt tika/site/src/site/apt/1.3/parser_guide.apt tika/site/src/site/apt/1.4/parser_guide.apt tika/site/src/site/apt/1.5/parser_guide.apt tika/site/src/site/apt/1.6/parser_guide.apt tika/site/src/site/apt/1.7/parser_guide.apt tika/site/src/site/apt/1.8/parser_guide.apt tika/site/src/site/apt/1.9/parser_guide.apt Modified: tika/site/src/site/apt/0.10/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.10/parser_guide.apt?rev=1726164=1726163=1726164=diff == --- tika/site/src/site/apt/0.10/parser_guide.apt (original) +++ tika/site/src/site/apt/0.10/parser_guide.apt Fri Jan 22 09:37:17 2016 @@ -1,8 +1,6 @@ Get Tika parsing up and running in 5 minutes - Arturo Beltran - ~~ Licensed to the Apache Software Foundation (ASF) under one or more ~~ contributor license agreements. See the NOTICE file distributed with Modified: tika/site/src/site/apt/0.7/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.7/parser_guide.apt?rev=1726164=1726163=1726164=diff == --- tika/site/src/site/apt/0.7/parser_guide.apt (original) +++ tika/site/src/site/apt/0.7/parser_guide.apt Fri Jan 22 09:37:17 2016 @@ -1,8 +1,6 @@ Get Tika parsing up and running in 5 minutes - Arturo Beltran - ~~ Licensed to the Apache Software Foundation (ASF) under one or more ~~ contributor license agreements. 
See the NOTICE file distributed with Modified: tika/site/src/site/apt/0.8/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.8/parser_guide.apt?rev=1726164=1726163=1726164=diff == --- tika/site/src/site/apt/0.8/parser_guide.apt (original) +++ tika/site/src/site/apt/0.8/parser_guide.apt Fri Jan 22 09:37:17 2016 @@ -1,8 +1,6 @@ Get Tika parsing up and running in 5 minutes - Arturo Beltran - ~~ Licensed to the Apache Software Foundation (ASF) under one or more ~~ contributor license agreements. See the NOTICE file distributed with Modified: tika/site/src/site/apt/0.9/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.9/parser_guide.apt?rev=1726164=1726163=1726164=diff == --- tika/site/src/site/apt/0.9/parser_guide.apt (original) +++ tika/site/src/site/apt/0.9/parser_guide.apt Fri Jan 22 09:37:17 2016 @@ -1,8 +1,6 @@ Get Tika parsing up and running in 5 minutes - Arturo Beltran - ~~ Licensed to the Apache Software Foundation (ASF) under one or more ~~ contributor license agreements. See the NOTICE file distributed with Modified: tika/site/src/site/apt/1.0/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.0/parser_guide.apt?rev=1726164=1726163=1726164=diff == --- tika/site/src/site/apt/1.0/parser_guide.apt (original) +++ tika/site/src/site/apt/1.0/parser_guide.apt Fri Jan 22 09:37:17 2016 @@ -1,8 +1,6
svn commit: r1726166 - in /tika/site/publish: ./ 0.10/ 0.7/ 0.8/ 0.9/ 1.0/ 1.1/ 1.10/ 1.11/ 1.2/ 1.3/ 1.4/ 1.5/ 1.6/ 1.7/ 1.8/ 1.9/
Author: nick Date: Fri Jan 22 09:45:05 2016 New Revision: 1726166 URL: http://svn.apache.org/viewvc?rev=1726166=rev Log: Republish the site for Git updates Modified: tika/site/publish/0.10/parser_guide.html tika/site/publish/0.7/parser_guide.html tika/site/publish/0.8/parser_guide.html tika/site/publish/0.9/parser_guide.html tika/site/publish/1.0/parser_guide.html tika/site/publish/1.1/parser_guide.html tika/site/publish/1.10/parser_guide.html tika/site/publish/1.11/parser_guide.html tika/site/publish/1.2/parser_guide.html tika/site/publish/1.3/parser_guide.html tika/site/publish/1.4/parser_guide.html tika/site/publish/1.5/parser_guide.html tika/site/publish/1.6/parser_guide.html tika/site/publish/1.7/parser_guide.html tika/site/publish/1.8/parser_guide.html tika/site/publish/1.9/parser_guide.html tika/site/publish/contribute.html Modified: tika/site/publish/0.10/parser_guide.html URL: http://svn.apache.org/viewvc/tika/site/publish/0.10/parser_guide.html?rev=1726166=1726165=1726166=diff == --- tika/site/publish/0.10/parser_guide.html (original) +++ tika/site/publish/0.10/parser_guide.html Fri Jan 22 09:45:05 2016 @@ -99,7 +99,7 @@ The Getting Started document describes how to build Apache Tika from sources and how to start using Tika in an application. Pay close attention and follow the instructions in the Getting and building the sources section. Add your MIME-Type -You first need to modify http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;>tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml in order to Tika can map the file extension with its MIME-Type. 
You should add something like this: +You first need to modify https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master;>tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml in order to Tika can map the file extension with its MIME-Type. You should add something like this: mime-type type=application/hello glob pattern=*.hi/ @@ -178,7 +178,7 @@ public class HelloParser implements Pars List the new parser Finally, you should explicitly tell the AutoDetectParser to include your new parser. This step is only needed if you want to use the AutoDetectParser functionality. If you figure out the correct parser in a different way, it isn't needed. -List your new parser in: http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;>tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +List your new parser in: https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master;>tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -352,7 +352,7 @@ public class HelloParser implements Pars - Copyright 2015 + Copyright 2016 http://www.apache.org/;>The Apache Software Foundation. Site powered by http://maven.apache.org/;>Apache Maven. Search powered by Modified: tika/site/publish/0.7/parser_guide.html URL: http://svn.apache.org/viewvc/tika/site/publish/0.7/parser_guide.html?rev=1726166=1726165=1726166=diff == --- tika/site/publish/0.7/parser_guide.html (original) +++ tika/site/publish/0.7/parser_guide.html Fri Jan 22 09:45:05 2016 @@ -99,7 +99,7 @@ The Getting Started document describes how to build Apache Tika from sources and how to start using Tika in an application. Pay close attention and follow the instructions in the Getting and building the sources section. 
Add your MIME-Type -You first need to modify http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;>tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml in order to Tika can map the file extension with its MIME-Type. You should add something like this: +You first need to modify https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master;>tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml in order to Tika can map the file extension with its MIME-Type. You should add something like this: mime-type type=application/hello glob pattern=*.hi/ @@ -178,7 +178,7 @@ public class HelloParser implements Pars List the new parser Finally, you should explicitly tell the AutoDetectParser to include your new parser. This step is only n
svn commit: r1726163 - in /tika/site/src/site/apt: 0.10/ 0.7/ 0.8/ 0.9/ 1.0/ 1.1/ 1.10/ 1.11/ 1.2/ 1.3/ 1.4/ 1.5/ 1.6/ 1.7/ 1.8/ 1.9/
Author: nick Date: Fri Jan 22 09:35:27 2016 New Revision: 1726163 URL: http://svn.apache.org/viewvc?rev=1726163=rev Log: Change SVN view URL to a Git one Modified: tika/site/src/site/apt/0.10/parser_guide.apt tika/site/src/site/apt/0.7/parser_guide.apt tika/site/src/site/apt/0.8/parser_guide.apt tika/site/src/site/apt/0.9/parser_guide.apt tika/site/src/site/apt/1.0/parser_guide.apt tika/site/src/site/apt/1.1/parser_guide.apt tika/site/src/site/apt/1.10/parser_guide.apt tika/site/src/site/apt/1.11/parser_guide.apt tika/site/src/site/apt/1.2/parser_guide.apt tika/site/src/site/apt/1.3/parser_guide.apt tika/site/src/site/apt/1.4/parser_guide.apt tika/site/src/site/apt/1.5/parser_guide.apt tika/site/src/site/apt/1.6/parser_guide.apt tika/site/src/site/apt/1.7/parser_guide.apt tika/site/src/site/apt/1.8/parser_guide.apt tika/site/src/site/apt/1.9/parser_guide.apt Modified: tika/site/src/site/apt/0.10/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.10/parser_guide.apt?rev=1726163=1726162=1726163=diff == --- tika/site/src/site/apt/0.10/parser_guide.apt (original) +++ tika/site/src/site/apt/0.10/parser_guide.apt Fri Jan 22 09:35:27 2016 @@ -35,7 +35,7 @@ Get Tika parsing up and running in 5 min * {Add your MIME-Type} - You first need to modify {{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} + You first need to modify {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} in order to Tika can map the file extension with its MIME-Type. 
You should add something like this: --- Modified: tika/site/src/site/apt/0.7/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.7/parser_guide.apt?rev=1726163=1726162=1726163=diff == --- tika/site/src/site/apt/0.7/parser_guide.apt (original) +++ tika/site/src/site/apt/0.7/parser_guide.apt Fri Jan 22 09:35:27 2016 @@ -35,7 +35,7 @@ Get Tika parsing up and running in 5 min * {Add your MIME-Type} - You first need to modify {{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} + You first need to modify {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} in order to Tika can map the file extension with its MIME-Type. You should add something like this: --- Modified: tika/site/src/site/apt/0.8/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.8/parser_guide.apt?rev=1726163=1726162=1726163=diff == --- tika/site/src/site/apt/0.8/parser_guide.apt (original) +++ tika/site/src/site/apt/0.8/parser_guide.apt Fri Jan 22 09:35:27 2016 @@ -35,7 +35,7 @@ Get Tika parsing up and running in 5 min * {Add your MIME-Type} - You first need to modify {{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} + You first need to modify {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} in order to Tika can map the file extension with its MIME-Type. 
You should add something like this: --- Modified: tika/site/src/site/apt/0.9/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.9/parser_guide.apt?rev=1726163=1726162=1726163=diff == --- tika/site/src/site/apt/0.9/parser_guide.apt (original) +++ tika/site/src/site/apt/0.9/parser_guide.apt Fri Jan 22 09:35:27 2016 @@ -35,7 +35,7 @@ Get Tika parsing up and running in 5 min * {Add your MIME-Type} - You first need to modify {{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} + You first need to modify {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache
svn commit: r1726165 - in /tika/site/src/site/apt: 0.10/ 0.7/ 0.8/ 0.9/ 1.0/ 1.1/ 1.10/ 1.11/ 1.2/ 1.3/ 1.4/ 1.5/ 1.6/ 1.7/ 1.8/ 1.9/
Author: nick Date: Fri Jan 22 09:40:16 2016 New Revision: 1726165 URL: http://svn.apache.org/viewvc?rev=1726165=rev Log: Update another SVN view link to a Git view link Modified: tika/site/src/site/apt/0.10/parser_guide.apt tika/site/src/site/apt/0.7/parser_guide.apt tika/site/src/site/apt/0.8/parser_guide.apt tika/site/src/site/apt/0.9/parser_guide.apt tika/site/src/site/apt/1.0/parser_guide.apt tika/site/src/site/apt/1.1/parser_guide.apt tika/site/src/site/apt/1.10/parser_guide.apt tika/site/src/site/apt/1.11/parser_guide.apt tika/site/src/site/apt/1.2/parser_guide.apt tika/site/src/site/apt/1.3/parser_guide.apt tika/site/src/site/apt/1.4/parser_guide.apt tika/site/src/site/apt/1.5/parser_guide.apt tika/site/src/site/apt/1.6/parser_guide.apt tika/site/src/site/apt/1.7/parser_guide.apt tika/site/src/site/apt/1.8/parser_guide.apt tika/site/src/site/apt/1.9/parser_guide.apt Modified: tika/site/src/site/apt/0.10/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.10/parser_guide.apt?rev=1726165=1726164=1726165=diff == --- tika/site/src/site/apt/0.10/parser_guide.apt (original) +++ tika/site/src/site/apt/0.10/parser_guide.apt Fri Jan 22 09:40:16 2016 @@ -128,6 +128,6 @@ public class HelloParser implements Pars If you figure out the correct parser in a different way, it isn't needed. 
List your new parser in: - {{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}} + {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}} Modified: tika/site/src/site/apt/0.7/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.7/parser_guide.apt?rev=1726165=1726164=1726165=diff == --- tika/site/src/site/apt/0.7/parser_guide.apt (original) +++ tika/site/src/site/apt/0.7/parser_guide.apt Fri Jan 22 09:40:16 2016 @@ -128,6 +128,6 @@ public class HelloParser implements Pars If you figure out the correct parser in a different way, it isn't needed. List your new parser in: - {{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}} + {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}} Modified: tika/site/src/site/apt/0.8/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.8/parser_guide.apt?rev=1726165=1726164=1726165=diff == --- tika/site/src/site/apt/0.8/parser_guide.apt (original) +++ tika/site/src/site/apt/0.8/parser_guide.apt Fri Jan 22 09:40:16 2016 @@ -128,6 +128,6 @@ public class HelloParser implements Pars If you figure out the correct parser in a different way, it isn't needed. 
List your new parser in: - {{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}} + {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}} Modified: tika/site/src/site/apt/0.9/parser_guide.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.9/parser_guide.apt?rev=1726165=1726164=1726165=diff == --- tika/site/src/site/apt/0.9/parser_guide.apt (original) +++ tika/site/src/site/apt/0.9/parser_guide.apt Fri Jan 22 09:40:16 2016 @@ -128,6 +128,6 @@ public class HelloParser implements Pars If you figure out the correct parser in a different way, it isn't needed. List your new parser in: - {{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}} + {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources
svn commit: r1726153 - /tika/site/src/site/apt/contribute.apt.vm
Author: nick Date: Fri Jan 22 07:43:29 2016 New Revision: 1726153 URL: http://svn.apache.org/viewvc?rev=1726153=rev Log: Update the contributing guide for Git - other SVN references still remain Modified: tika/site/src/site/apt/contribute.apt.vm Modified: tika/site/src/site/apt/contribute.apt.vm URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/contribute.apt.vm?rev=1726153=1726152=1726153=diff == --- tika/site/src/site/apt/contribute.apt.vm (original) +++ tika/site/src/site/apt/contribute.apt.vm Fri Jan 22 07:43:29 2016 @@ -30,14 +30,13 @@ Source Code To download the source code for the latest release of Apache Tika, please see the {{{./download.html}Download page}}. - The master copy of the Apache Tika source code is held in SVN. You can - checkout the code from - {{{https://svn.apache.org/repos/asf/tika/trunk}https://svn.apache.org/repos/asf/tika/trunk}} + The master copy of the Apache Tika source code is held in GIT. You can + clone (checkout) the code from + {{{https://git-wip-us.apache.org/repos/asf/tika.git}https://git-wip-us.apache.org/repos/asf/tika.git}} and you can browse it online through - {{{http://svn.apache.org/viewvc/tika/trunk/}Viewvc}} + {{{https://git-wip-us.apache.org/repos/asf?p=tika.git}Git web interface}} - For those who prefer working with Git, a read only mirror is available - from {{{http://git.apache.org/}git.apache.org}}. We also maintain a + For those who prefer working on GitHub, we also maintain a {{{https://github.com/apache/tika/}GitHub mirror}}, which you are welcome to fork from and open pull requests to. @@ -76,13 +75,9 @@ Submitting Enhancements and Fixes / new code. The JIRA can be used for discussions on the code, and provides a single identifier for the change. - SVN - For users of SVN, you can use <<>> to generate a patch - file of your changes, which can then be attached to the issue. Note that - a SVN diff won't normally include new or binary files, so these will need - to be attached separately. 
- - Git - Git users can run <<>> to generate an SVN - compatible patch which can then be attached to an issue. + Git - Git users can run <<>> to generate a patch + of changed and new files, including binaries, which can then be attached + to an issue. Github Pulls - If you are working from our {{{https://github.com/apache/tika/}GitHub mirror}}, it is possible to
svn commit: r1723581 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: nick Date: Thu Jan 7 15:53:47 2016 New Revision: 1723581 URL: http://svn.apache.org/viewvc?rev=1723581=rev Log: Try to make the common parts clearer for the DER-encoded PKCS7 signature (length comes between 0x308. and the pkcs7 object) Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1723581=1723580=1723581=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu Jan 7 15:53:47 2016 @@ -524,15 +524,29 @@ + - - - - + + + + + + + + + + + + + + + +
svn commit: r1721390 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: nick Date: Tue Dec 22 13:01:38 2015 New Revision: 1721390 URL: http://svn.apache.org/viewvc?rev=1721390=rev Log: TIKA-1817 Mime magic for AutoCAD DXF in Ascii and Binary, plus the related DXB Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1721390=1721389=1721390=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue Dec 22 13:01:38 2015 @@ -4795,12 +4795,41 @@ + +DXB +<_comment>AutoCAD DXF simplified Binary +http://en.wikipedia.org/wiki/AutoCAD_DXF + + + + + + DXF <_comment>AutoCAD DXF http://en.wikipedia.org/wiki/AutoCAD_DXF + + + +<_comment>AutoCAD DXF in Binary form + + + + + + +<_comment>AutoCAD DXF in ASCII Text form + + + + + + + +
svn commit: r1717559 - /tika/branches/2.x/tika-core/pom.xml
Author: nick Date: Wed Dec 2 00:33:37 2015 New Revision: 1717559 URL: http://svn.apache.org/viewvc?rev=1717559&view=rev Log: Change what CLIRR checks against - we expect breakages vs Tika Core 1.0, that is why it is 2.0! Modified: tika/branches/2.x/tika-core/pom.xml Modified: tika/branches/2.x/tika-core/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-core/pom.xml?rev=1717559&r1=1717558&r2=1717559&view=diff == --- tika/branches/2.x/tika-core/pom.xml (original) +++ tika/branches/2.x/tika-core/pom.xml Wed Dec 2 00:33:37 2015 @@ -112,19 +112,15 @@ -org/apache/tika/config/TikaActivator -org/apache/tika/metadata/Property$PropertyType -org/apache/tika/metadata/Property$ValueType -org/apache/tika/metadata/DublinCore -org/apache/tika/metadata/Metadata -org/apache/tika/metadata/MSOffice -org/apache/tika/parser/EmptyParser +org/apache/tika/config/LoadErrorHandler org.apache.tika tika-core - 1.0 + + + 1.11 jar
svn commit: r1717560 - in /tika/branches/2.x/tika-core/src: main/java/org/apache/tika/config/ test/java/org/apache/tika/config/
Author: nick Date: Wed Dec 2 00:33:41 2015 New Revision: 1717560 URL: http://svn.apache.org/viewvc?rev=1717560=rev Log: TIKA-1805 Notify via LoadErrorHandler if DefaultParser or DefaultDetector could not find any implementations of their service classes Modified: tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika/branches/2.x/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java Modified: tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java?rev=1717560=1717559=1717560=diff == --- tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java (original) +++ tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java Wed Dec 2 00:33:41 2015 @@ -39,6 +39,16 @@ public interface LoadErrorHandler { * @param throwable the encountered problem */ void handleLoadError(String classname, Throwable throwable); + +/** + * Handles the case of no occurrences of the specified service interface + * being found. The implementation can log or otherwise process + * the given error information. If the method returns normally, then + * the service loader simply returns an empty list to the caller. + * + * @param interfacename name of the service interface with no occurrences + */ +void handleNoOccurrences(String interfacename); /** * Strategy that simply ignores all problems. 
@@ -46,6 +56,8 @@ public interface LoadErrorHandler { LoadErrorHandler IGNORE = new LoadErrorHandler() { public void handleLoadError(String classname, Throwable throwable) { } +public void handleNoOccurrences(String interfacename) { +} @Override public String toString() { return "IGNORE"; @@ -61,6 +73,10 @@ public interface LoadErrorHandler { Logger.getLogger(classname).log( Level.WARNING, "Unable to load " + classname, throwable); } +public void handleNoOccurrences(String interfacename) { +Logger.getLogger(interfacename).log( +Level.WARNING, "No occurrences found of " + interfacename); +} @Override public String toString() { return "WARN"; @@ -76,6 +92,9 @@ public interface LoadErrorHandler { public void handleLoadError(String classname, Throwable throwable) { throw new RuntimeException("Unable to load " + classname, throwable); } +public void handleNoOccurrences(String interfacename) { +throw new RuntimeException("No occurrences found of " + interfacename); +} @Override public String toString() { return "THROW"; Modified: tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java?rev=1717560=1717559=1717560=diff == --- tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java (original) +++ tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java Wed Dec 2 00:33:41 2015 @@ -334,6 +334,9 @@ public class ServiceLoader { } } } +if (providers.isEmpty()) { +handler.handleNoOccurrences(iface.getName()); +} return providers; } Modified: tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1717560=1717559=1717560=diff == --- tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Wed Dec 2 00:33:41 2015 @@ -162,8 +162,8 @@ public class TikaConfig { ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader(); this.mimeTypes = typesFromDomElement(element); -this.detector = detectorLoader.loadOverall(element, mimeTypes, loader); this.parser = parserLoader.loadOverall(element, mimeTypes, loader); +this.detector = detectorLoader.loadOverall(element, mimeT
svn commit: r1717557 - in /tika/branches/2.x: CHANGES.txt tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
Author: nick Date: Tue Dec 1 23:58:32 2015 New Revision: 1717557 URL: http://svn.apache.org/viewvc?rev=1717557=rev Log: Change the default LoadErrorHandler for Tika 2.x to be warn (TIKA-1805) Modified: tika/branches/2.x/CHANGES.txt tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Modified: tika/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/tika/branches/2.x/CHANGES.txt?rev=1717557=1717556=1717557=diff == --- tika/branches/2.x/CHANGES.txt (original) +++ tika/branches/2.x/CHANGES.txt Tue Dec 1 23:58:32 2015 @@ -1,3 +1,11 @@ +Release 2.0 - Future Development + + * The default LoadErrorHandler is now WARN, to alert you to missing +parser classes and their dependencies. To keep the old behaviour, +set your LoadErrorHandler to IGNORE. (TIKA-1805) + + * (Something about more specific parser bundles, plus an overall one) + Release 1.12 - Current Development * A parser to compute motion properties in Videos, e.g., Modified: tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1717557=1717556=1717557=diff == --- tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Tue Dec 1 23:58:32 2015 @@ -447,10 +447,10 @@ public class TikaConfig { ServiceLoader serviceLoader; if (serviceLoaderElement != null) { boolean dynamic = Boolean.parseBoolean(serviceLoaderElement.getAttribute("dynamic")); -LoadErrorHandler loadErrorHandler = LoadErrorHandler.IGNORE; +LoadErrorHandler loadErrorHandler = LoadErrorHandler.WARN; String loadErrorHandleConfig = serviceLoaderElement.getAttribute("loadErrorHandler"); - if(LoadErrorHandler.WARN.toString().equalsIgnoreCase(loadErrorHandleConfig)) { -loadErrorHandler = LoadErrorHandler.WARN; + 
if(LoadErrorHandler.IGNORE.toString().equalsIgnoreCase(loadErrorHandleConfig)) { +loadErrorHandler = LoadErrorHandler.IGNORE; } else if(LoadErrorHandler.THROW.toString().equalsIgnoreCase(loadErrorHandleConfig)) { loadErrorHandler = LoadErrorHandler.THROW; }
svn commit: r1714493 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
Author: nick Date: Sun Nov 15 19:56:25 2015 New Revision: 1714493 URL: http://svn.apache.org/viewvc?rev=1714493=rev Log: Fix inconsistent whitespace Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1714493=1714492=1714493=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java Sun Nov 15 19:56:25 2015 @@ -46,139 +46,139 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; public class GeoParser extends AbstractParser { - private static final long serialVersionUID = -2241391757440215491L; -private static final Logger LOG = Logger.getLogger(GeoParser.class.getName()); - private static final MediaType MEDIA_TYPE = - MediaType.application("geotopic"); - private static final Set SUPPORTED_TYPES = - Collections.singleton(MEDIA_TYPE); - private GeoParserConfig config = new GeoParserConfig(); - - private boolean initialized; - private URL modelUrl; - private NameEntityExtractor extractor; - private boolean available; - - @Override - public Set getSupportedTypes(ParseContext parseContext) { - return SUPPORTED_TYPES; - } - - /** -* Initializes this parser -* @param modelUrl the URL to NER model -*/ - public void initialize(URL modelUrl) { - - if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) { - //previously initialized for the same URL - return; - } - this.modelUrl = modelUrl; - //if NER model is available and lucene-geo-gazetteer is available - this.available = modelUrl != null && - ExternalParser.check(new String[] { "lucene-geo-gazetteer", "--help" }, -1); - if (this.available) { - try { - this.extractor = new NameEntityExtractor(modelUrl); 
- } catch (Exception e) { - e.printStackTrace(); - this.available = false; - } - } - initialized = true; - - } - - @Override - public void parse(InputStream stream, ContentHandler handler, - Metadata metadata, ParseContext context) throws IOException, - SAXException, TikaException { - - /*configure this parser by ParseContext Object-*/ - - this.config = context.get(GeoParserConfig.class, config); - initialize(this.config.getNerModelUrl()); - if (!isAvailable()) { - return; - } - - /*get locationNameEntities and best nameEntity for the input stream-*/ - extractor.getAllNameEntitiesfromInput(stream); - extractor.getBestNameEntity(); - ArrayList locationNameEntities = extractor.locationNameEntities; - String bestner = extractor.bestNameEntity; - - /*resolve geonames for each ner, store results in a hashmap-*/ - HashMap<String, ArrayList> resolvedGeonames = searchGeoNames(locationNameEntities); - - /*store locationNameEntities and their geonames in a geotag, each input has one geotag-*/ - GeoTag geotag = new GeoTag(); - geotag.toGeoTag(resolvedGeonames, bestner); - - /* add resolved entities in metadata */ - - metadata.add("Geographic_NAME", geotag.Geographic_NAME); - metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE); - metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE); - for (int i = 0; i < geotag.alternatives.size(); ++i) { - GeoTag alter = (GeoTag) geotag.alternatives.get(i); - metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME); - metadata.add("Optional_LONGITUDE" + (i + 1), - alter.Geographic_LONGTITUDE); - meta
svn commit: r1714494 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
Author: nick Date: Sun Nov 15 20:01:18 2015 New Revision: 1714494 URL: http://svn.apache.org/viewvc?rev=1714494=rev Log: TIKA-1791 Comments and logging Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1714494=1714493=1714494=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java Sun Nov 15 20:01:18 2015 @@ -71,24 +71,25 @@ public class GeoParser extends AbstractP */ public void initialize(URL modelUrl) { if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) { -// Previously initialized for the same URL +// Previously initialized for the same URL, no initialization needed return; } this.modelUrl = modelUrl; -//if NER model is available and lucene-geo-gazetteer is available -this.available = modelUrl != null && -ExternalParser.check(new String[] { "lucene-geo-gazetteer", "--help" }, -1); + +// Check if the NER model is available, and if the +// lucene-geo-gazetteer is available +this.available = modelUrl != null && ExternalParser.check( +new String[] { "lucene-geo-gazetteer", "--help" }, -1); if (this.available) { try { this.extractor = new NameEntityExtractor(modelUrl); } catch (Exception e) { -e.printStackTrace(); +LOG.warning("Named Entity Extractor setup failed: " + e); this.available = false; } } initialized = true; - } @Override @@ -126,9 +127,9 @@ public class GeoParser extends AbstractP GeoTag alter = (GeoTag) geotag.alternatives.get(i); metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME); metadata.add("Optional_LONGITUDE" + (i + 1), -alter.Geographic_LONGTITUDE); + alter.Geographic_LONGTITUDE); metadata.add("Optional_LATITUDE" + (i + 1), 
-alter.Geographic_LATITUDE); + alter.Geographic_LATITUDE); } } @@ -149,8 +150,7 @@ public class GeoParser extends AbstractP exec.setWatchdog(watchdog); PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream); exec.setStreamHandler(streamHandler); -int exitValue = exec.execute(cmdLine, -EnvironmentUtils.getProcEnvironment()); +int exitValue = exec.execute(cmdLine, EnvironmentUtils.getProcEnvironment()); String outputJson = outputStream.toString("UTF-8"); JSONArray json = (JSONArray) JSONValue.parse(outputJson); @@ -172,7 +172,6 @@ public class GeoParser extends AbstractP } return returnHash; - } public boolean isAvailable() {
svn commit: r1714495 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic: GeoParserConfig.java NameEntityExtractor.java
Author: nick Date: Sun Nov 15 20:01:22 2015 New Revision: 1714495 URL: http://svn.apache.org/viewvc?rev=1714495=rev Log: Fix inconsistent whitespace Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1714495=1714494=1714495=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java Sun Nov 15 20:01:22 2015 @@ -23,34 +23,31 @@ import java.net.MalformedURLException; import java.net.URL; public class GeoParserConfig implements Serializable { +private static final long serialVersionUID = -3167692634278575818L; +private URL nerModelUrl = null; - private static final long serialVersionUID = 2L; - private URL nerModelUrl = null; - - public GeoParserConfig() { - this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin"); - } - - public void setNERModelPath(String path) { - if (path == null) - return; - File file = new File(path); - if (file.isDirectory() || !file.exists()) { - return; - } - try { - this.nerModelUrl = file.toURI().toURL(); - } catch (MalformedURLException e) { - throw new RuntimeException(e); - } - } - - public void setNerModelUrl(URL url) { - this.nerModelUrl = url; - } - - public URL getNerModelUrl() { - return nerModelUrl; - } - +public GeoParserConfig() { +this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin"); +} + +public void setNERModelPath(String path) { +if (path == null) +return; +File file = new File(path); +if (file.isDirectory() || !file.exists()) { +return; +} +try { +this.nerModelUrl 
= file.toURI().toURL(); +} catch (MalformedURLException e) { +throw new RuntimeException(e); +} +} + +public void setNerModelUrl(URL url) { +this.nerModelUrl = url; +} +public URL getNerModelUrl() { +return nerModelUrl; +} } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1714495=1714494=1714495=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java Sun Nov 15 20:01:22 2015 @@ -37,93 +37,88 @@ import org.apache.commons.io.IOUtils; import static java.nio.charset.StandardCharsets.UTF_8; public class NameEntityExtractor { - - ArrayList locationNameEntities; - String bestNameEntity; - private HashMap<String, Integer> tf; - private final NameFinderME nameFinder; - - public NameEntityExtractor(URL modelUrl) throws IOException { - this.locationNameEntities = new ArrayList(); - this.bestNameEntity = null; - TokenNameFinderModel model = new TokenNameFinderModel(modelUrl); - this.nameFinder = new NameFinderME(model); - this.tf = new HashMap<String, Integer>(); - } - - /* -* Use OpenNLP to extract location names that's appearing in the steam. -* OpenNLP's default Name Finder accuracy is not very good, please refer to -* its documentation. -* -* @param stream stream that passed from this.parse() -*/ - - public void getAllNameEntitiesfromInput(InputStream stream) - throws IOException { - - - String[] in = IOUtils.toString(stream, UTF_8).split(" "); - Span nameE[]; - //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind - synchronized (nameFinder) { - nameE = nameFinder.find(in); - //the same name finder is reused, so clear adaptive data -
svn commit: r1714496 - /tika/trunk/CHANGES.txt
Author: nick Date: Sun Nov 15 20:03:00 2015 New Revision: 1714496 URL: http://svn.apache.org/viewvc?rev=1714496&view=rev Log: Changelog update Modified: tika/trunk/CHANGES.txt Modified: tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1714496&r1=1714495&r2=1714496&view=diff == --- tika/trunk/CHANGES.txt (original) +++ tika/trunk/CHANGES.txt Sun Nov 15 20:03:00 2015 @@ -8,6 +8,8 @@ Release 1.12 - Current Development * Tika Facade parse methods for Path and File added which take a Metadata object, to mirror the existing InputStream one (GitHub-60) + * GeoParser fix for loading the NER model from a jar file (TIKA-1791) + Release 1.11 - 10/18/2015
svn commit: r1714341 - in /tika/site: publish/1.11/gettingstarted.html src/site/apt/1.11/gettingstarted.apt
Author: nick Date: Sat Nov 14 16:23:57 2015 New Revision: 1714341 URL: http://svn.apache.org/viewvc?rev=1714341=rev Log: Add Gradle and Ivy instructions Modified: tika/site/publish/1.11/gettingstarted.html tika/site/src/site/apt/1.11/gettingstarted.apt Modified: tika/site/publish/1.11/gettingstarted.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.11/gettingstarted.html?rev=1714341=1714340=1714341=diff == --- tika/site/publish/1.11/gettingstarted.html (original) +++ tika/site/publish/1.11/gettingstarted.html Sat Nov 14 16:23:57 2015 @@ -111,33 +111,45 @@ Tika bundle. An OSGi bundle that combines tika-parsers with non-OSGified parser libraries to make them easy to deploy in an OSGi environment. Using Tika as a Maven dependency -The core library, tika-core, contains the key interfaces and classes of Tika and can be used by itself if you don't need the full set of parsers from the tika-parsers component. The tika-core dependency looks like this: +The core library, tika-core , contains the key interfaces and classes of Tika and can be used by itself if you don't need the full set of parsers from the tika-parsers component. The tika-core dependency looks like this: dependency groupIdorg.apache.tika/groupId artifactIdtika-core/artifactId -version.../version +version1.11/version /dependency -If you want to use Tika to parse documents (instead of simply detecting document types, etc.), you'll want to depend on tika-parsers instead: +If you want to use Tika to parse documents (instead of simply detecting document types, etc.), you'll want to depend on tika-parsers instead: dependency groupIdorg.apache.tika/groupId artifactIdtika-parsers/artifactId -version.../version +version1.11/version /dependency Note that adding this dependency will introduce a number of transitive dependencies to your project, including one on tika-core. You need to make sure that these dependencies won't conflict with your existing project dependencies. 
You can use the following command in the tika-parsers directory to get a full listing of all the dependencies. $ mvn dependency:tree | grep :compile +Using Tika in a Gradle-built project +To add a dependency on Apache Tika to your Gradle built project, including the full set of parsers, you should depend on the tika-parsers artifact: + +dependencies { +runtime 'org.apache.tika:tika-parsers:1.11' +} + Using Tika in an Ant project -Unless you use a dependency manager tool like http://ant.apache.org/ivy/;>Apache Ivy, the easiest way to use Tika is to include either the tika-core or the tika-app jar in your classpath, depending on whether you want just the core functionality or also all the parser implementations. +If you are using http://ant.apache.org/ivy/;>Apache Ivy as your dependency manager tool with Ant, then to include Tika with the full set of parsers, you should depend on the tika-parsers artifact like this: + +dependencies +dependency org=org.apache.tika name=tika-parsers rev=1.11/ +/dependencies +Otherwise, probably the easiest way to use Tika is to include the full tika-app jar on your classpath. For just core functionality, you can add the tika-core jar, but be aware that the full set of parsers have a large number of dependencies which must be included which is very fiddly to do by hand with Ant! To include Tika in your Ant project, you should do something like: classpath ... 
!-- your other classpath entries -- - !-- either: -- + !-- either: Tika Core only, no parsers -- pathelement location=path/to/tika-core-${tika.version}.jar/ - !-- or: -- + !-- or: Tika with all Parsers-- pathelement location=path/to/tika-app-${tika.version}.jar/ /classpath Modified: tika/site/src/site/apt/1.11/gettingstarted.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.11/gettingstarted.apt?rev=1714341=1714340=1714341=diff == --- tika/site/src/site/apt/1.11/gettingstarted.apt (original) +++ tika/site/src/site/apt/1.11/gettingstarted.apt Sat Nov 14 16:23:57 2015 @@ -71,26 +71,27 @@ Build artifacts Using Tika as a Maven dependency - The core library, tika-core, contains the key interfaces and classes of Tika - and can be used by itself if you don't need the full set of parsers from - the tika-parsers component. The tika-core dependency looks like this: + The core library, <<< tika-core >>>, contains the key interfaces and classes + of Tika and can be used by itself if you don't need the full set of parsers + from the <<< tika-parsers >>> component. The tika-core dependency looks like + this: --- org.apache.tika tika-core -... +1.11 --- If you want to use Tika to parse documents (instead
svn commit: r1714361 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: nick Date: Sat Nov 14 20:25:59 2015 New Revision: 1714361 URL: http://svn.apache.org/viewvc?rev=1714361&view=rev Log: TIKA-1793 Add rfc822 email detection for common thunderbird message first headers Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1714361&r1=1714360&r2=1714361&view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Sat Nov 14 20:25:59 2015 @@ -5157,6 +5157,8 @@ + + @@ -5167,6 +5169,7 @@ +
svn commit: r1713677 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/ tika-parsers/src/test/java/org/apache/tika/detect/ tika
Author: nick Date: Tue Nov 10 16:18:45 2015 New Revision: 1713677 URL: http://svn.apache.org/viewvc?rev=1713677=rev Log: TIKA-1792 ASiC E and S mimetypes, detection and tests. Files and mimetype from Roberto Benedetti Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCE.asice (with props) tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCS.asics (with props) Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1713677=1713676=1713677=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue Nov 10 16:18:45 2015 @@ -992,6 +992,33 @@ + + +ASiC-E +<_comment>Extended Associated Signature Container + + + + + + + + + + + +ASiC-S +<_comment>Simple Associated Signature Container + + + + + + + + + + @@ -3834,6 +3861,7 @@ + Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1713677=1713676=1713677=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Tue Nov 10 16:18:45 2015 @@ -180,9 +180,9 @@ public class ZipContainerDetector implem } /** - * OpenDocument files, along with EPub files, have a mimetype - * entry in the root of their Zip file. 
This entry contains the - * mimetype of the overall file, stored as a single string. + * OpenDocument files, along with EPub files and ASiC ones, have a + * mimetype entry in the root of their Zip file. This entry contains + * the mimetype of the overall file, stored as a single string. */ private static MediaType detectOpenDocument(ZipFile zip) { try { Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1713677=1713676=1713677=diff == --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Tue Nov 10 16:18:45 2015 @@ -324,7 +324,15 @@ public class TestContainerAwareDetector public void testDetectIPA() throws Exception { assertTypeByNameAndData("testIPA.ipa", "application/x-itunes-ipa"); assertTypeByData("testIPA.ipa", "application/x-itunes-ipa"); - } +} + +@Test +public void testASiC() throws Exception { +assertTypeByData("testASiCE.asice", "application/vnd.etsi.asic-e+zip"); +assertTypeByData("testASiCS.asics", "application/vnd.etsi.asic-s+zip"); +assertTypeByNameAndData("testASiCE.asice", "application/vnd.etsi.asic-e+zip"); +assertTypeByNameAndData("testASiCS.asics", "application/vnd.etsi.asic-s+zip"); +} @Test public void testDetectZip() throws Exception { Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCE.asice URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCE.asice?rev=1713677=auto == Binary file - no diff available. 
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCE.asice -- svn:mime-type = application/octet-stream Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCS.asics URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCS.asics?rev=1713677=auto =
svn commit: r1713697 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: nick Date: Tue Nov 10 16:58:04 2015 New Revision: 1713697 URL: http://svn.apache.org/viewvc?rev=1713697&view=rev Log: Tweak ASiC comment and priority based on feedback from the spec Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1713697&r1=1713696&r2=1713697&view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue Nov 10 16:58:04 2015 @@ -997,8 +997,8 @@ ASiC-E <_comment>Extended Associated Signature Container - - + + @@ -1010,8 +1010,8 @@ ASiC-S <_comment>Simple Associated Signature Container - - + +
svn commit: r1711162 - in /tika/trunk: CHANGES.txt tika-core/src/main/java/org/apache/tika/Tika.java
Author: nick Date: Wed Oct 28 23:21:41 2015 New Revision: 1711162 URL: http://svn.apache.org/viewvc?rev=1711162=rev Log: Add Tika Facade parse methods for Path and File which take a Metadata object, to mirror the existing InputStream one. This closes #60 from GitHub Modified: tika/trunk/CHANGES.txt tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Modified: tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1711162=1711161=1711162=diff == --- tika/trunk/CHANGES.txt (original) +++ tika/trunk/CHANGES.txt Wed Oct 28 23:21:41 2015 @@ -5,6 +5,9 @@ Release 1.12 - Current Development * Fix regression with spacing in PPT via Andreas Beeker (TIKA-1777). + * Tika Facade parse methods for Path and File added which take a +Metadata object, to mirror the existing InputStream one (GitHub-60) + Release 1.11 - 10/18/2015 Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=1711162=1711161=1711162=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Wed Oct 28 23:21:41 2015 @@ -283,7 +283,8 @@ public class Tika { */ public String detect(File file) throws IOException { Metadata metadata = new Metadata(); -try (InputStream stream = TikaInputStream.get(file, metadata)) { +try (@SuppressWarnings("deprecation") +InputStream stream = TikaInputStream.get(file, metadata)) { return detect(stream, metadata); } } @@ -399,7 +400,7 @@ public class Tika { * the time when the {@link Reader#close()} method is called. 
* * @param stream the document to be parsed - * @param metadata document metadata + * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the document can not be read or parsed */ @@ -427,32 +428,62 @@ public class Tika { /** * Parses the file at the given path and returns the extracted text content. + * + * Metadata information extracted from the document is returned in + * the supplied metadata instance. * * @param path the path of the file to be parsed + * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the file can not be read or parsed */ -public Reader parse(Path path) throws IOException { -Metadata metadata = new Metadata(); +public Reader parse(Path path, Metadata metadata) throws IOException { InputStream stream = TikaInputStream.get(path, metadata); return parse(stream, metadata); } + +/** + * Parses the file at the given path and returns the extracted text content. + * + * @param path the path of the file to be parsed + * @return extracted text content + * @throws IOException if the file can not be read or parsed + */ +public Reader parse(Path path) throws IOException { +return parse(path, new Metadata()); +} /** * Parses the given file and returns the extracted text content. + * + * Metadata information extracted from the document is returned in + * the supplied metadata instance. 
* * @param file the file to be parsed + * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the file can not be read or parsed * @see #parse(Path) */ -public Reader parse(File file) throws IOException { -Metadata metadata = new Metadata(); +public Reader parse(File file, Metadata metadata) throws IOException { +@SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata); return parse(stream, metadata); } /** + * Parses the given file and returns the extracted text content. + * + * @param file the file to be parsed + * @return extracted text content + * @throws IOException if the file can not be read or parsed + * @see #parse(Path) + */ +public Reader parse(File file) throws IOException { +return parse(file, new Metadata()); +} + +/** * Parses the resource at the given URL and returns the extracted * text content. * @@ -606,6 +637,7 @@ public class Tika { */ public String parseToString(File file) throws IOException, TikaException { Metadata metadata = new Metadata(); +@SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata);
svn commit: r1710536 - in /tika/site/publish: 1.11/formats.html 1.12/ 1.12/examples.html 1.12/formats.html
Author: nick Date: Mon Oct 26 09:24:17 2015 New Revision: 1710536 URL: http://svn.apache.org/viewvc?rev=1710536=rev Log: Publish site changes Added: tika/site/publish/1.12/ tika/site/publish/1.12/examples.html tika/site/publish/1.12/formats.html Modified: tika/site/publish/1.11/formats.html Modified: tika/site/publish/1.11/formats.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.11/formats.html?rev=1710536=1710535=1710536=diff == --- tika/site/publish/1.11/formats.html (original) +++ tika/site/publish/1.11/formats.html Mon Oct 26 09:24:17 2015 @@ -208,7 +208,423 @@ The JackcessParser is able to extract metadata and content in a tabular form, from Microsoft Access database files. Full list of supported formats: -TODO Populate this at release time + +org.apache.tika.parser.asm.ClassParser + +application/java-vm +org.apache.tika.parser.audio.AudioParser + +audio/x-wav +audio/x-aiff +audio/basic +org.apache.tika.parser.audio.MidiParser + +application/x-midi +audio/midi +org.apache.tika.parser.chm.ChmParser + +application/vnd.ms-htmlhelp +application/chm +application/x-chm +org.apache.tika.parser.code.SourceCodeParser + +text/x-java-source +text/x-c++src +text/x-groovy +org.apache.tika.parser.crypto.Pkcs7Parser + +application/pkcs7-signature +application/pkcs7-mime +org.apache.tika.parser.dif.DIFParser + +application/dif+xml +org.apache.tika.parser.dwg.DWGParser + +image/vnd.dwg +org.apache.tika.parser.epub.EpubParser + +application/x-ibooks+zip +application/epub+zip +org.apache.tika.parser.executable.ExecutableParser + +application/x-elf +application/x-sharedlib +application/x-executable +application/x-msdownload +application/x-coredump +application/x-object +org.apache.tika.parser.external.ExternalParser + +video/mp4 +video/avi +video/mpeg +video/x-msvideo +org.apache.tika.parser.feed.FeedParser + +application/atom+xml +application/rss+xml +org.apache.tika.parser.font.AdobeFontMetricParser + +application/x-font-adobe-metric 
+org.apache.tika.parser.font.TrueTypeParser + +application/x-font-ttf +org.apache.tika.parser.gdal.GDALParser + +image/x-ozi +application/x-snodas +application/x-ecrg-toc +image/envisat +application/x-doq2 +application/x-rs2 +application/x-gsag +application/x-ers +application/fits +application/x-pnm +image/adrg +image/gif +application/x-generic-bin +application/x-bt +application/x-zmap +application/x-hdf +image/eir +application/x-ace2 +application/grass-ascii-grid +application/x-l1b +application/x-gsc +image/jp2 +image/hfa +image/fits +image/raster +application/x-epsilon +image/x-srp +application/x-envi-hdr +application/x-ctable2 +application/x-srtmhgt +application/jaxa-pal-sar +application/x-ndf +application/sdts-raster +application/x-gtx +application/x-rst +application/x-xyz +application/terragen +application/x-gs7bg +image/arg +application/elas +image/big-gif +application/x-geo-pdf +application/x-ctg +application/aaigrid +application/x-lcp +application/x-nwt-grc +application/x-fast +application/x-usgs-dem +application/x-nwt-grd +application/x-ingr +application/x-envi +application/x-rik +application/x-blx +application/x-wcs +image/ceos +application/x-ngs-geoid +application/x-r +image/bmp +application/x-http +application/x-til +application/x-pds +application/x-rasterlite +application/x-gmt +application/x-msgn +image/ilwis +application/aig +application/x-rmf +image/x-hdf5-image +image/sar-ceos +application/x-kro +application/vrt +application/x-netcdf +image/nitf +image/png +image/geotiff +image/x-mff2 +application/x-webp +image/ida +application/x-gsbg +application/x-ntv2 +application/x-coasp +application/x-los-las +application/x-tsx +application/x-bag +image/fit +application/x-lan +application/x-map +image/jpeg +application/x-dods +application/jdem +application/gff +application/x-isis2 +application/x-isis3 +application/xpm +application/x-pcidsk +application/x-gxf +application/x-wms +application/x-cosar +image/bsb +application/x-grib +application/x-mbtiles 
+application/x-cappi +application/x-rpf-toc +image/x-mff +image/x-dimap +image/x-pcraster +application/x-ppi +application/x-sdat +application/pcisdk +application/x-cpg +application/leveller +image/sgi +image/x-fujibas +image/x-airsar +application/x-e00-grid +application/x-kml +application/x-p-aux +application/x-doq1 +application/dted +application/x-dipex +org.apache.tika.parser.geo.topic.GeoParser + +application/geotopic +org.apache.tika.parser.geoinfo.GeographicInformationParser + +text/iso19139+xml +org.apache.tika.parser.grib.GribParser + +application/x-grib2 +org.apache.tika.parser.hdf.HDFParser + +application/x-hdf +org.apache.tika.parser.html.HtmlParser + +application/x-asp +application/xhtml+xml +application/vnd.wap.xhtml+xml +text/html +org.apache.tika.parser.image.BPGParser + +image/bpg +image/x-bpg +org.apache.tika.parser.image.ImageParser + +image/x-ms-bmp +image/png +image/x-icon +image/vnd.wap.wbmp +image/gif
svn commit: r1708950 - /tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG.jp2
Author: nick Date: Fri Oct 16 10:32:28 2015 New Revision: 1708950 URL: http://svn.apache.org/viewvc?rev=1708950&view=rev Log: Test JP2 (JPEG2000) file from Andreas Hirtzel from TIKA-1773 Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG.jp2 (with props) Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG.jp2 URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG.jp2?rev=1708950&view=auto == Binary file - no diff available. Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG.jp2 -- svn:mime-type = application/octet-stream
svn commit: r1708940 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: nick Date: Fri Oct 16 10:04:47 2015 New Revision: 1708940 URL: http://svn.apache.org/viewvc?rev=1708940=rev Log: TIKA-1772 WebVTT mime entry from Alexander Widera Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1708940=1708939=1708940=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Fri Oct 16 10:04:47 2015 @@ -5538,6 +5538,13 @@ + +<_comment>Web Video Text Tracks Format +WebVTT + + + + <_comment>AWK script
svn commit: r1708975 - /tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Author: nick Date: Fri Oct 16 12:33:54 2015 New Revision: 1708975 URL: http://svn.apache.org/viewvc?rev=1708975&view=rev Log: JPEG2000 (jp2) detection tests Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1708975&r1=1708974&r2=1708975&view=diff == --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Fri Oct 16 12:33:54 2015 @@ -394,6 +394,10 @@ public class TestMimeTypes { assertTypeByName("image/jpeg", "x.jif"); assertTypeByName("image/jpeg", "x.jfif"); assertTypeByName("image/jpeg", "x.jfi"); + +assertType("image/jp2", "testJPEG.jp2"); +assertTypeByData("image/jp2", "testJPEG.jp2"); +assertTypeByName("image/jp2", "x.jp2"); } @Test
svn commit: r1705181 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
Author: nick Date: Thu Sep 24 22:38:38 2015 New Revision: 1705181 URL: http://svn.apache.org/viewvc?rev=1705181=rev Log: Expand the Tika Config dumping support for parsers Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java?rev=1705181=1705180=1705181=diff == --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java (original) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java Thu Sep 24 22:38:38 2015 @@ -24,6 +24,7 @@ import java.io.OutputStreamWriter; import java.io.StringWriter; import java.io.Writer; import java.nio.charset.Charset; +import java.util.Collections; import java.util.List; import java.util.Set; import java.util.TreeSet; @@ -97,8 +98,8 @@ public class DumpTikaConfigExample { } private void addTranslator(Mode mode, Element rootElement, Document doc, TikaConfig config) { -// TikaConfig only reads the first translator from the list, -// but it looks like it expects a list +// Unlike the other entries, TikaConfig only wants one of +// these, and no outer list Translator translator = config.getTranslator(); if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) { Node mimeComment = doc.createComment( @@ -160,54 +161,65 @@ public class DumpTikaConfigExample { } else if (mode == Mode.MINIMAL) { mode = Mode.CURRENT; } -addParsers(mode, rootElement, doc, parser); + +Element parsersElement = doc.createElement("parsers"); +rootElement.appendChild(parsersElement); + +addParser(mode, parsersElement, doc, parser); } -private void addParsers(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception { -Parser realParser = parser; +private void addParser(Mode mode, Element rootElement, 
Document doc, Parser parser) throws Exception { +// If the parser is decorated, is it a kind where we output the parser inside? +ParserDecorator decoration = null; if (parser instanceof ParserDecorator) { -realParser = ((ParserDecorator)parser).getWrappedParser(); +if (parser.getClass().getName().startsWith(ParserDecorator.class.getName()+"$")) { +decoration = ((ParserDecorator)parser); +parser = decoration.getWrappedParser(); +} } -List children = null; -if (mode == Mode.CURRENT && realParser instanceof DefaultParser) { -// Don't output any children -// TODO List excluded children -} else if (realParser instanceof CompositeParser) { -children = ((CompositeParser)realParser).getAllComponentParsers(); -if (realParser instanceof DefaultParser || parser == realParser) { -realParser = null; +boolean outputParser = true; +List children = Collections.emptyList(); +if (mode == Mode.CURRENT && parser instanceof DefaultParser) { +// Only output the parser, not the children +} else if (parser instanceof CompositeParser) { +children = ((CompositeParser)parser).getAllComponentParsers(); +// Special case for a naked composite +if (parser.getClass().equals(CompositeParser.class)) { +outputParser = false; +} +// Special case for making Default to static +if (mode == Mode.STATIC && parser instanceof DefaultParser) { +outputParser = false; } } -Element parsersElement = doc.createElement("parsers"); -rootElement.appendChild(parsersElement); -Element addParserTo = parsersElement; - -if (realParser != null) { -addParserTo = addParser(addParserTo, doc, parser, realParser); +if (outputParser) { +rootElement = addParser(rootElement, doc, parser, decoration); } -if (children != null && !children.isEmpty()) { -for (Parser p : children) { -addParser(addParserTo, doc, p, p); -} +for (Parser childParser : children) { +addParser(mode, rootElement, doc, childParser); } +// TODO Parser Exclusions } -private Element addParser(Element rootElement, Document doc, Parser parser, Parser realParser) 
throws Exception { +private Element addParser(Element rootElement, Document doc, Parser parser, ParserDecor
svn commit: r1705191 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika-example/src/main/java/or
Author: nick Date: Thu Sep 24 22:59:15 2015 New Revision: 1705191 URL: http://svn.apache.org/viewvc?rev=1705191=rev Log: Expose the ServiceLoader used by TikaConfig, and use that to support serialising the service loader config xml section Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java?rev=1705191=1705190=1705191=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java Thu Sep 24 22:59:15 2015 @@ -46,6 +46,10 @@ public interface LoadErrorHandler { LoadErrorHandler IGNORE = new LoadErrorHandler() { public void handleLoadError(String classname, Throwable throwable) { } +@Override +public String toString() { +return "IGNORE"; +} }; /** @@ -57,6 +61,10 @@ public interface LoadErrorHandler { Logger.getLogger(classname).log( Level.WARNING, "Unable to load " + classname, throwable); } +@Override +public String toString() { +return "WARN"; +} }; /** @@ -68,6 +76,9 @@ public interface LoadErrorHandler { public void handleLoadError(String classname, Throwable throwable) { throw new RuntimeException("Unable to load " + classname, throwable); } +@Override +public String toString() { +return "THROW"; +} }; - } Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1705191=1705190=1705191=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Thu Sep 24 22:59:15 2015 @@ -79,6 +79,7 @@ public class TikaConfig { private static Translator getDefaultTranslator(ServiceLoader loader) { return new DefaultTranslator(loader); } +private final ServiceLoader serviceLoader; private final CompositeParser parser; private final CompositeDetector detector; private final Translator translator; @@ -143,6 +144,7 @@ public class TikaConfig { this.detector = detectorLoader.loadOverall(element, mimeTypes, loader); this.parser = parserLoader.loadOverall(element, mimeTypes, loader); this.translator = translatorLoader.loadOverall(element, mimeTypes, loader); +this.serviceLoader = loader; } /** @@ -159,7 +161,7 @@ public class TikaConfig { */ public TikaConfig(ClassLoader loader) throws MimeTypeException, IOException { -ServiceLoader serviceLoader = new ServiceLoader(loader); +this.serviceLoader = new ServiceLoader(loader); this.mimeTypes = getDefaultMimeTypes(loader); this.detector = getDefaultDetector(mimeTypes, serviceLoader); this.parser = getDefaultParser(mimeTypes, serviceLoader); @@ -184,7 +186,7 @@ public class TikaConfig { * @throws TikaException if problem with MimeTypes or parsing XML config */ public TikaConfig() throws TikaException, IOException { -ServiceLoader loader = new ServiceLoader(); +this.serviceLoader = new ServiceLoader(); String config = System.getProperty("tika.config"); if (config == null) { @@ -193,9 +195,9 @@ public class TikaConfig { if (config == null) { this.mimeTypes = getDefaultMimeTypes(ServiceLoader.getContextClassLoader()); -this.parser = getDefaultParser(mimeTypes, loader); -this.detector = getDefaultDetector(mimeTypes, loader); -this.translator = getDefaultTranslator(loader); +this.parser = getDefaultParser(mimeTypes, serviceLoader); +this.detector = getDefaultDetector(mimeTypes, serviceLoader); +this.translator = getDefaultTranslator(serviceLoader); } else { // Locate the given configuration file InputStream stream = 
null; @@ -210,7 +212,7 @@ public class TikaConfig { } } if (stream == null) { -stream = loader.getResourceAsStream(config); +
svn commit: r1704934 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/language/translate/ tika-example/src/main/java/org/apache/tika/example/ tika-example/src/test/java/org/apache/tika/exampl
Author: nick Date: Wed Sep 23 21:04:08 2015 New Revision: 1704934 URL: http://svn.apache.org/viewvc?rev=1704934=rev Log: TIKA-1657 Update the example of dumping a Tika Config to support different output modes, for Translators and Detectors Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java?rev=1704934=1704933=1704934=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java Wed Sep 23 21:04:08 2015 @@ -99,6 +99,19 @@ public class DefaultTranslator implement } throw new TikaException("No translators currently available"); } + +/** + * Returns all available translators + */ +public List getTranslators() { +return getDefaultTranslators(loader); +} +/** + * Returns the current translator + */ +public Translator getTranslator() { +return getFirstAvailable(loader); +} public boolean isAvailable() { return getFirstAvailable(loader) != null; Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java?rev=1704934=1704933=1704934=diff == --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java (original) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java Wed Sep 23 21:04:08 2015 @@ -17,7 +17,8 @@ package org.apache.tika.example; 
-import java.io.File; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; @@ -29,6 +30,7 @@ import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; + import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.OutputKeys; @@ -38,6 +40,7 @@ import javax.xml.transform.dom.DOMSource import javax.xml.transform.stream.StreamResult; import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; @@ -51,8 +54,6 @@ import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; -import static java.nio.charset.StandardCharsets.UTF_8; - /** * This class shows how to dump a TikaConfig object to a configuration file. @@ -70,21 +71,21 @@ public class DumpTikaConfigExample { * @param writer writer to which to write * @throws Exception */ -public void dump(TikaConfig config, Writer writer, String encoding) throws Exception { +public void dump(TikaConfig config, Mode mode, Writer writer, String encoding) throws Exception { DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); + // root elements Document doc = docBuilder.newDocument(); Element rootElement = doc.createElement("properties"); doc.appendChild(rootElement); -addMimeComment(rootElement, doc); -addTranslator(rootElement, doc, config); -addDetectors(rootElement, doc, config); -addParsers(rootElement, doc, config); +addMimeComment(mode, rootElement, doc); +addTranslator(mode, rootElement, doc, config); +addDetectors(mode, rootElement, doc, config); +addParsers(mode, rootElement, doc, config); - -//now write +// now write TransformerFactory transformerFactory = 
TransformerFactory.newInstance(); Transformer transformer = transformerFactory.newTransformer(); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); @@ -96,33 +97,50 @@ public class DumpTikaConfigExample { transformer.transform(source, result); } -private void addTranslator(Element rootElement, Document doc, TikaConfig config) { -//TikaConfig only reads the first translator from the list, -//but it looks like it ex
svn commit: r1701201 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: nick Date: Fri Sep 4 09:56:49 2015 New Revision: 1701201 URL: http://svn.apache.org/r1701201 Log: TIKA-1728 Fix the HWP v5 mime type hierarchy Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1701201&r1=1701200&r2=1701201&view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Fri Sep 4 09:56:49 2015 @@ -3242,7 +3242,7 @@ <_comment>Hangul Word Processor File v5 - +
svn commit: r1700984 - in /tika/trunk/tika-parsers/src/test/resources/test-documents: testHWP_3.0.hwp testHWP_5.0.hwp
Author: nick Date: Thu Sep 3 10:56:48 2015 New Revision: 1700984 URL: http://svn.apache.org/r1700984 Log: Test HWP files from Mungeol Heo from TIKA-1728 Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_3.0.hwp (with props) tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_5.0.hwp (with props) Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_3.0.hwp URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_3.0.hwp?rev=1700984&view=auto == Binary file - no diff available. Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_3.0.hwp -- svn:mime-type = application/octet-stream Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_5.0.hwp URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_5.0.hwp?rev=1700984&view=auto == Binary file - no diff available. Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_5.0.hwp -- svn:mime-type = application/octet-stream
svn commit: r1696817 - /tika/trunk/tika-bundle/pom.xml
Author: nick Date: Thu Aug 20 17:08:26 2015 New Revision: 1696817 URL: http://svn.apache.org/r1696817 Log: TIKA-1711 As Tika needs 1.7, remove 1.6 specific bits of the bundle build. Patch from Yaniv Kunda Modified: tika/trunk/tika-bundle/pom.xml Modified: tika/trunk/tika-bundle/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-bundle/pom.xml?rev=1696817r1=1696816r2=1696817view=diff == --- tika/trunk/tika-bundle/pom.xml (original) +++ tika/trunk/tika-bundle/pom.xml Thu Aug 20 17:08:26 2015 @@ -387,56 +387,45 @@ skiptrue/skip /configuration /plugin -/plugins - /build - profiles -profile - idjava6/id - activation -jdk[1.6,)/jdk - /activation - build -plugins - plugin -artifactIdmaven-assembly-plugin/artifactId -executions - execution -phasepre-integration-test/phase -goals - goalsingle/goal -/goals -configuration - descriptortest-bundles.xml/descriptor - finalNametest/finalName - attachfalse/attach -/configuration - /execution -/executions - /plugin - plugin -artifactIdmaven-failsafe-plugin/artifactId -version2.10/version -executions - execution -goals - goalintegration-test/goal - goalverify/goal -/goals - /execution -/executions + plugin +artifactIdmaven-assembly-plugin/artifactId +executions + execution +phasepre-integration-test/phase +goals + goalsingle/goal +/goals configuration - systemPropertyVariables -org.ops4j.pax.logging.DefaultServiceLog.level - WARN -/org.ops4j.pax.logging.DefaultServiceLog.level - /systemPropertyVariables + descriptortest-bundles.xml/descriptor + finalNametest/finalName + attachfalse/attach /configuration - /plugin -/plugins - /build -/profile - /profiles + /execution +/executions + /plugin + + plugin +artifactIdmaven-failsafe-plugin/artifactId +version2.10/version +executions + execution +goals + goalintegration-test/goal + goalverify/goal +/goals + /execution +/executions +configuration + systemPropertyVariables +org.ops4j.pax.logging.DefaultServiceLog.level + WARN +/org.ops4j.pax.logging.DefaultServiceLog.level + 
/systemPropertyVariables +/configuration + /plugin +/plugins + /build organization nameThe Apache Software Founation/name
svn commit: r1696746 - in /tika/trunk/tika-parsers/src/test/java/org/apache/tika: embedder/ mime/ parser/ parser/chm/ parser/code/ parser/geo/topic/ parser/html/ parser/image/ parser/jdbc/ parser/mail
Author: nick Date: Thu Aug 20 09:59:17 2015 New Revision: 1696746 URL: http://svn.apache.org/r1696746 Log: TIKA-1710 patch from Yaniv Kunda - Use Commons IO instead of the Tika Core IO copies, and java.nio.charset.StandardCharsets Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestParameters.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/WebPParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java?rev=1696746r1=1696745r2=1696746view=diff == --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java Thu Aug 20 09:59:17 2015 @@ -16,6 +16,7 @@ */ package org.apache.tika.embedder; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -38,7 +39,6 @@ import java.util.Locale; import java.util.Map; import org.apache.tika.exception.TikaException; -import org.apache.tika.io.IOUtils; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -59,7 +59,7 @@ public class ExternalEmbedderTest { protected static final DateFormat 
EXPECTED_METADATA_DATE_FORMATTER = new SimpleDateFormat(-MM-dd'T'HH:mm:ss, Locale.ROOT); -protected static final String DEFAULT_CHARSET = IOUtils.UTF_8.name(); +protected static final String DEFAULT_CHARSET = UTF_8.name(); private static final String COMMAND_METADATA_ARGUMENT_DESCRIPTION = dc:description; private static final String TEST_TXT_PATH = /test-documents/testTXT.txt; Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1696746r1=1696745r2=1696746view=diff == --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original) +++ tika/trunk/tika
svn commit: r1696749 - in /tika/trunk/tika-batch: ./ src/main/java/org/apache/tika/batch/ src/main/java/org/apache/tika/batch/fs/ src/main/java/org/apache/tika/batch/fs/strawman/ src/test/java/org/apa
Author: nick Date: Thu Aug 20 10:02:19 2015 New Revision: 1696749 URL: http://svn.apache.org/r1696749 Log: TIKA-1710 patch from Yaniv Kunda - Use Commons IO instead of the Tika Core IO copies, and java.nio.charset.StandardCharsets Modified: tika/trunk/tika-batch/pom.xml tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/Interrupter.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/strawman/StrawManTikaAppDriver.java tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/CommandLineParserBuilderTest.java tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchDriverTest.java tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/StringStreamGobbler.java Modified: tika/trunk/tika-batch/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/pom.xml?rev=1696749r1=1696748r2=1696749view=diff == --- tika/trunk/tika-batch/pom.xml (original) +++ tika/trunk/tika-batch/pom.xml Thu Aug 20 10:02:19 2015 @@ -67,6 +67,11 @@ version${cli.version}/version /dependency dependency + groupIdcommons-io/groupId + artifactIdcommons-io/artifactId + version${commons.io.version}/version +/dependency +dependency groupIdorg.apache.tika/groupId artifactIdtika-core/artifactId 
version${project.version}/version @@ -85,12 +90,6 @@ artifactIdjunit/artifactId scopetest/scope /dependency -dependency - groupIdcommons-io/groupId - artifactIdcommons-io/artifactId - scopetest/scope - version${commons.io.version}/version -/dependency /dependencies build Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java?rev=1696749r1=1696748r2=1696749view=diff == --- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java (original) +++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java Thu Aug 20 10:02:19 2015 @@ -31,10 +31,10 @@ import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; -import org.apache.tika.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static java.nio.charset.StandardCharsets.UTF_8; /** * This is the main processor class for a single process. 
@@ -134,7 +134,7 @@ public class BatchProcess implements Cal //System.err should be redirected to System.out PrintStream sysErr = System.err; try { -outputStreamWriter = new PrintStream(sysErr, true, IOUtils.UTF_8.toString()); +outputStreamWriter = new PrintStream(sysErr, true, UTF_8.toString()); } catch (IOException e) { throw new RuntimeException(Can't redirect streams); } Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java?rev=1696749r1=1696748r2=1696749view=diff == --- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java (original) +++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java Thu Aug 20 10:02:19 2015 @@ -29,10 +29,12 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; -import org.apache.tika.io.IOUtils; +import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static java.nio.charset.StandardCharsets.UTF_8; + public class BatchProcessDriverCLI { /** @@ -285,7 +287,7 @@ public class BatchProcessDriverCLI { private BufferedReader reader; private InterruptWatcher(InputStream is) { -reader = new BufferedReader(new InputStreamReader
svn commit: r1696745 [1/2] - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser: audio/ chm/accessor/ chm/core/ code/ crypto/ ctakes/ dif/ envi/ epub/ feed/ gdal/ geo/topic/ hdf/ html/ i
Author: nick Date: Thu Aug 20 09:51:44 2015 New Revision: 1696745 URL: http://svn.apache.org/r1696745 Log: TIKA-1710 patch from Yaniv Kunda - Use Commons IO instead of the Tika Core IO copies, and java.nio.charset.StandardCharsets Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/audio/MidiParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dif/DIFParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/gdal/GDALParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/hdf/HDFParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
svn commit: r1696745 [2/2] - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser: audio/ chm/accessor/ chm/core/ code/ crypto/ ctakes/ dif/ envi/ epub/ feed/ gdal/ geo/topic/ hdf/ html/ i
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1696745r1=1696744r2=1696745view=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java Thu Aug 20 09:51:44 2015 @@ -27,6 +27,7 @@ import java.io.UnsupportedEncodingExcept import java.util.Locale; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.commons.io.FilenameUtils; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentInputStream; @@ -35,7 +36,6 @@ import org.apache.poi.poifs.filesystem.N import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.Ole10NativeException; import org.apache.poi.util.IOUtils; -import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.RTFMetadata; Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1696745r1=1696744r2=1696745view=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Thu Aug 20 09:51:44 2015 @@ -21,8 +21,8 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; +import org.apache.commons.io.input.TaggedInputStream; import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TaggedInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import 
org.apache.tika.parser.AbstractParser; Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java?rev=1696745r1=1696744r2=1696745view=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java Thu Aug 20 09:51:44 2015 @@ -30,7 +30,6 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import org.apache.tika.exception.TikaException; -import org.apache.tika.io.IOUtils; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -41,6 +40,8 @@ import org.apache.tika.sax.XHTMLContentH import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Parser that uses the strings (or strings-alternative) command to find the * printable strings in a object, or other binary, file @@ -267,7 +268,7 @@ public class StringsParser extends Abstr int totalBytes = 0; try { - reader = new BufferedReader(new InputStreamReader(stream, IOUtils.UTF_8)); + reader = new BufferedReader(new InputStreamReader(stream, UTF_8)); int n = 0; while ((n = reader.read(buffer)) != -1) { @@ -320,7 +321,7 @@ public class StringsParser extends Abstr String fileOutput = null; try { - reader = new BufferedReader(new InputStreamReader(out, IOUtils.UTF_8)); + reader = new BufferedReader(new InputStreamReader(out, UTF_8)); fileOutput = reader.readLine(); } catch (IOException ioe) { Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=1696745r1=1696744r2=1696745view=diff == --- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java Thu Aug 20 09:51:44 2015 @@ -22,10 +22,10 @@ import java.nio.charset.Charset; import java.util.Collections; import java.util.Set; +import org.apache.commons.io.input.CloseShieldInputStream; import
svn commit: r1696836 - in /tika/trunk: ./ tika-parent/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/ tika-parsers/src/test/java/org/apache/tika/parser/pkg/
Author: nick Date: Thu Aug 20 18:31:15 2015 New Revision: 1696836 URL: http://svn.apache.org/r1696836 Log: TIKA-1718 Upgrade to Commons Compress 1.10, and fix various TODOs that this permits Modified: tika/trunk/CHANGES.txt tika/trunk/tika-parent/pom.xml tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java Modified: tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1696836r1=1696835r2=1696836view=diff == --- tika/trunk/CHANGES.txt (original) +++ tika/trunk/CHANGES.txt Thu Aug 20 18:31:15 2015 @@ -12,6 +12,9 @@ Release 1.11 - Current Development * Corrected Tika Config XML detector defintion explicit loading of MimeTypes (TIKA-1708) + * Upgraded to Commons Compress 1.10, which enables zlib compressed +archives support (TIKA-1718) + Release 1.10 - 8/1/2015 Modified: tika/trunk/tika-parent/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-parent/pom.xml?rev=1696836r1=1696835r2=1696836view=diff == --- tika/trunk/tika-parent/pom.xml (original) +++ tika/trunk/tika-parent/pom.xml Thu Aug 20 18:31:15 2015 @@ -301,7 +301,7 @@ maven.compiler.source1.7/maven.compiler.source maven.compiler.target1.7/maven.compiler.target project.reporting.outputEncoding${project.build.sourceEncoding}/project.reporting.outputEncoding -commons.compress.version1.9/commons.compress.version +commons.compress.version1.10/commons.compress.version commons.io.version2.4/commons.io.version slf4j.version1.7.12/slf4j.version /properties Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java?rev=1696836r1=1696835r2=1696836view=diff == --- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java Thu Aug 20 18:31:15 2015 @@ -58,11 +58,10 @@ public class CompressorParser extends Ab private static final MediaType GZIP_ALT = MediaType.application(x-gzip); private static final MediaType XZ = MediaType.application(x-xz); private static final MediaType PACK = MediaType.application(application/x-java-pack200); -// TODO Not yet supported by CompressorStreamFactory, see COMPRESS-316 private static final MediaType ZLIB = MediaType.application(zlib); private static final SetMediaType SUPPORTED_TYPES = -MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, XZ, PACK); +MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, XZ, PACK, ZLIB); static MediaType getMediaType(CompressorInputStream stream) { // TODO Add support for the remaining CompressorInputStream formats: @@ -103,14 +102,14 @@ public class CompressorParser extends Ab CompressorInputStream cis; try { -CompressorStreamFactory factory = new CompressorStreamFactory(); CompressorParserOptions options = context.get(CompressorParserOptions.class, new CompressorParserOptions() { public boolean decompressConcatenated(Metadata metadata) { return false; } }); - factory.setDecompressConcatenated(options.decompressConcatenated(metadata)); +CompressorStreamFactory factory = +new CompressorStreamFactory(options.decompressConcatenated(metadata)); cis = factory.createCompressorInputStream(stream); } catch (CompressorException e) { throw new TikaException(Unable to uncompress document stream, e); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1696836r1=1696835r2=1696836view=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original) +++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Thu Aug 20 18:31:15 2015 @@ -24,6 +24,7 @@ import java.io.InputStream; import java.util.Date; import java.util.Set; +import org.apache.commons.compress.PasswordRequiredException; import org.apache.commons.compress.archivers.ArchiveEntry; import
svn commit: r1696833 - in /tika/trunk: tika-batch/pom.xml tika-parent/pom.xml tika-parsers/pom.xml
Author: nick Date: Thu Aug 20 18:08:44 2015 New Revision: 1696833 URL: http://svn.apache.org/r1696833 Log: TIKA-1718 Enforce a consistent commons compress version between components Modified: tika/trunk/tika-batch/pom.xml tika/trunk/tika-parent/pom.xml tika/trunk/tika-parsers/pom.xml Modified: tika/trunk/tika-batch/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/pom.xml?rev=1696833&r1=1696832&r2=1696833&view=diff == --- tika/trunk/tika-batch/pom.xml (original) +++ tika/trunk/tika-batch/pom.xml Thu Aug 20 18:08:44 2015 @@ -36,9 +36,6 @@ <properties> <cli.version>1.2</cli.version> -<!-- sync version with tika-server or move to parent? --> -<compress.version>1.9</compress.version> -<!-- sync with tika-parsers or move to parent? --> </properties> <dependencies> @@ -55,7 +52,7 @@ <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-compress</artifactId> - <version>${compress.version}</version> + <version>${commons.compress.version}</version> </dependency> <dependency> <groupId>org.slf4j</groupId> Modified: tika/trunk/tika-parent/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-parent/pom.xml?rev=1696833&r1=1696832&r2=1696833&view=diff == --- tika/trunk/tika-parent/pom.xml (original) +++ tika/trunk/tika-parent/pom.xml Thu Aug 20 18:08:44 2015 @@ -301,6 +301,7 @@ <maven.compiler.source>1.7</maven.compiler.source> <maven.compiler.target>1.7</maven.compiler.target> <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding> +<commons.compress.version>1.9</commons.compress.version> <commons.io.version>2.4</commons.io.version> <slf4j.version>1.7.12</slf4j.version> </properties> Modified: tika/trunk/tika-parsers/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1696833&r1=1696832&r2=1696833&view=diff == --- tika/trunk/tika-parsers/pom.xml (original) +++ tika/trunk/tika-parsers/pom.xml Thu Aug 20 18:08:44 2015 @@ -36,11 +36,10 @@ <properties> <poi.version>3.13-beta1</poi.version> +<!-- NOTE: sync codec version with POI --> <codec.version>1.9</codec.version> 
-<!-- NOTE: sync with POI --> -<compress.version>1.9</compress.version> +<!-- NOTE: sync tukaani version with commons-compress --> <tukaani.version>1.5</tukaani.version> -<!-- NOTE: sync with commons-compress --> <mime4j.version>0.7.2</mime4j.version> <vorbis.version>0.6</vorbis.version> <pdfbox.version>1.8.10</pdfbox.version> @@ -121,7 +120,7 @@ <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-compress</artifactId> - <version>${compress.version}</version> + <version>${commons.compress.version}</version> </dependency> <dependency> <groupId>org.tukaani</groupId>
svn commit: r1696856 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
Author: nick Date: Thu Aug 20 21:11:19 2015 New Revision: 1696856 URL: http://svn.apache.org/r1696856 Log: One more format to add support for Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java?rev=1696856&r1=1696855&r2=1696856&view=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java Thu Aug 20 21:11:19 2015 @@ -71,6 +71,7 @@ public class CompressorParser extends Ab static MediaType getMediaType(CompressorInputStream stream) { // TODO Add support for the remaining CompressorInputStream formats: // LZMACompressorInputStream +// LZWInputStream -> UnshrinkingInputStream if (stream instanceof BZip2CompressorInputStream) { return BZIP2; } else if (stream instanceof GzipCompressorInputStream) {
svn commit: r1696862 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
Author: nick Date: Thu Aug 20 21:46:28 2015 New Revision: 1696862 URL: http://svn.apache.org/r1696862 Log: Bring in line with other parsers with special InputStream requirements, by using TikaInputStream TIKA-1710 Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1696862r1=1696861r2=1696862view=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Thu Aug 20 21:46:28 2015 @@ -21,11 +21,11 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; -import org.apache.commons.io.input.TaggedInputStream; import org.apache.james.mime4j.MimeException; import org.apache.james.mime4j.parser.MimeStreamParser; import org.apache.james.mime4j.stream.MimeConfig; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; @@ -44,7 +44,6 @@ import org.xml.sax.SAXException; * @author jnio...@digitalpebble.com */ public class RFC822Parser extends AbstractParser { - /** * Serial version UID */ @@ -73,13 +72,12 @@ public class RFC822Parser extends Abstra xhtml, metadata, context, config.isStrictParsing()); parser.setContentHandler(mch); parser.setContentDecoding(true); -TaggedInputStream tagged = stream instanceof TaggedInputStream -? 
(TaggedInputStream)stream -: new TaggedInputStream(stream); + +TikaInputStream tstream = TikaInputStream.get(stream); try { -parser.parse(tagged); +parser.parse(tstream); } catch (IOException e) { -tagged.throwIfCauseOf(e); +tstream.throwIfCauseOf(e); throw new TikaException("Failed to parse an email message", e); } catch (MimeException e) { // Unwrap the exception in case it was not thrown by mime4j
svn commit: r1696859 - /tika/trunk/tika-parsers/pom.xml
Author: nick Date: Thu Aug 20 21:38:45 2015 New Revision: 1696859 URL: http://svn.apache.org/r1696859 Log: TIKA-1710 Guava is no longer required, we have StandardCharsets instead now Modified: tika/trunk/tika-parsers/pom.xml Modified: tika/trunk/tika-parsers/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1696859&r1=1696858&r2=1696859&view=diff == --- tika/trunk/tika-parsers/pom.xml (original) +++ tika/trunk/tika-parsers/pom.xml Thu Aug 20 21:38:45 2015 @@ -327,11 +327,6 @@ <artifactId>httpservices</artifactId> <version>${netcdf-java.version}</version> </dependency> -<dependency> - <groupId>com.google.guava</groupId> - <artifactId>guava</artifactId> - <version>11.0.2</version> -</dependency> <!-- Apache Commons CSV --> <dependency> <groupId>org.apache.commons</groupId>
svn commit: r1696860 - /tika/trunk/CHANGES.txt
Author: nick Date: Thu Aug 20 21:40:12 2015 New Revision: 1696860 URL: http://svn.apache.org/r1696860 Log: Changelog update Modified: tika/trunk/CHANGES.txt Modified: tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1696860r1=1696859r2=1696860view=diff == --- tika/trunk/CHANGES.txt (original) +++ tika/trunk/CHANGES.txt Thu Aug 20 21:40:12 2015 @@ -12,6 +12,10 @@ Release 1.11 - Current Development * Corrected Tika Config XML detector defintion explicit loading of MimeTypes (TIKA-1708) + * In Tika Parsers, Batch, Server, App and Examples, use Apache +Commons IO instead of inlined ex-Commons classes, and the Java 7 +Standard Charset definitions (TIKA-1710) + * Upgraded to Commons Compress 1.10, which enables zlib compressed archives support (TIKA-1718)
svn commit: r1696609 - in /tika/site/src/site/apt: 1.10/configuring.apt 1.11/configuring.apt
Author: nick Date: Wed Aug 19 15:04:22 2015 New Revision: 1696609 URL: http://svn.apache.org/r1696609 Log: Fix APT markup for service loader config documentation Modified: tika/site/src/site/apt/1.10/configuring.apt tika/site/src/site/apt/1.11/configuring.apt Modified: tika/site/src/site/apt/1.10/configuring.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.10/configuring.apt?rev=1696609r1=1696608r2=1696609view=diff == --- tika/site/src/site/apt/1.10/configuring.apt (original) +++ tika/site/src/site/apt/1.10/configuring.apt Wed Aug 19 15:04:22 2015 @@ -139,6 +139,9 @@ Configuring Tika While the work on that is ongoing, for now you will need to review the {{{./api/}Tika Javadocs}} to see how individual Translators are configured. +~~ When Translators can have their parameters configured, mention here about +~~ specifying which single one to use in the Tika Config XML + * {Configuring the Service Loader} Tika has a number of service provider types such as parsers, detectors, and translators. @@ -149,12 +152,14 @@ Configuring Tika The ServiceLoader's registry can be populated either statically or dynamically. -Static +** Static + Static loading is the default which requires no configuration. This configuration options is used in Tika deployments where the Tika JAR files reside together in the same classloader hierarchy. The services provides are loaded from provider configuration files located within the tika-parsers JAR file at META-INF/services. -Dynamic +** Dynamic + Dynamic loading may be required if the tika service providers will reside in different classloaders such as in OSGi. To allow a provider created in tika-config.xml to utilize dynamically loaded services you need to configure the ServiceLoader to be dynamic with the following configuration: @@ -166,15 +171,22 @@ Configuring Tika /properties --- +** Load Error Handling + The ServiceLoader can contains a handler to deal with errors that occur during provider initialization. 
For example if a class fails to initialize LoadErrorHandler deals with the exception that is thrown. This handler can be configured to: -IGNORE - (Default) Do nothing when providers fail to initialize. -WARN - Log a warning when providers fail to initialize. -THROW - Throw an exception when providers fail to initialize. +* IGNORE - (Default) Do nothing when providers fail to initialize. + +* WARN- Log a warning when providers fail to initialize. + +* THROW - Throw an exception when providers fail to initialize. + +[] + +For example to set the LoadErrorHandler to WARN then use the following configuration: -For example to set the LoadErrorHandler to WARN then use the following configuration: --- properties service-loader loadErrorHandler=WARN/ @@ -182,9 +194,6 @@ For example to set the LoadErrorHandler /properties --- -~~ When Translators can have their parameters configured, mention here about -~~ specifying which single one to use in the Tika Config XML - * {Using a Tika Configuration XML file} However you call Tika, the System Property of tika.config is Modified: tika/site/src/site/apt/1.11/configuring.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.11/configuring.apt?rev=1696609r1=1696608r2=1696609view=diff == --- tika/site/src/site/apt/1.11/configuring.apt (original) +++ tika/site/src/site/apt/1.11/configuring.apt Wed Aug 19 15:04:22 2015 @@ -139,6 +139,9 @@ Configuring Tika While the work on that is ongoing, for now you will need to review the {{{./api/}Tika Javadocs}} to see how individual Translators are configured. +~~ When Translators can have their parameters configured, mention here about +~~ specifying which single one to use in the Tika Config XML + * {Configuring the Service Loader} Tika has a number of service provider types such as parsers, detectors, and translators. @@ -149,12 +152,14 @@ Configuring Tika The ServiceLoader's registry can be populated either statically or dynamically. 
-Static +** Static + Static loading is the default which requires no configuration. This configuration options is used in Tika deployments where the Tika JAR files reside together in the same classloader hierarchy. The services provides are loaded from provider configuration files located within the tika-parsers JAR file at META-INF/services. -Dynamic +** Dynamic + Dynamic loading may be required if the tika service providers will reside in different classloaders such as in OSGi. To allow a provider created in tika-config.xml to utilize dynamically loaded services you need to configure
svn commit: r1696610 - in /tika/site/publish: 1.10/configuring.html 1.11/configuring.html
Author: nick Date: Wed Aug 19 15:06:01 2015 New Revision: 1696610 URL: http://svn.apache.org/r1696610 Log: Republish the site Modified: tika/site/publish/1.10/configuring.html tika/site/publish/1.11/configuring.html Modified: tika/site/publish/1.10/configuring.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.10/configuring.html?rev=1696610r1=1696609r2=1696610view=diff == --- tika/site/publish/1.10/configuring.html (original) +++ tika/site/publish/1.10/configuring.html Wed Aug 19 15:06:01 2015 @@ -96,7 +96,12 @@ lia href=#Configuring_Mime_TypesConfiguring Mime Types/a/li lia href=#Configuring_Language_IdentifiersConfiguring Language Identifiers/a/li lia href=#Configuring_TranslatorsConfiguring Translators/a/li -lia href=#Configuring_the_Service_LoaderConfiguring the Service Loader/a/li/ul/li/ul +lia href=#Configuring_the_Service_LoaderConfiguring the Service Loader/a +ul +lia href=#StaticStatic/a/li +lia href=#DynamicDynamic/a/li +lia href=#Load_Error_HandlingLoad Error Handling/a/li/ul/li +lia href=#Using_a_Tika_Configuration_XML_fileUsing a Tika Configuration XML file/a/li/ul/li/ul div class=section h3a name=Configuring_ParsersConfiguring Parsers/a/h3 pThrough the Tika Config xml, it is possible to have a high degree of control over which parsers are or aren't used, in what order of preferences etc. It is also possible to override just certain parts, to (for example) have quot;default except for PDFquot;./p @@ -156,22 +161,35 @@ pAt this time, there is no unified way to configure language identifiers. While the work on that is ongoing, for now you will need to review the a href=./api/Tika Javadocs/a to see how individual identifiers are configured./p/div div class=section h3a name=Configuring_TranslatorsConfiguring Translators/a/h3 -pAt this time, there is no unified way to configure Translators. 
While the work on that is ongoing, for now you will need to review the a href=./api/Tika Javadocs/a to see how individual Translators are configured./p/div +pAt this time, there is no unified way to configure Translators. While the work on that is ongoing, for now you will need to review the a href=./api/Tika Javadocs/a to see how individual Translators are configured./p!-- When Translators can have their parameters configured, mention here about --!-- specifying which single one to use in the Tika Config XML --/div div class=section h3a name=Configuring_the_Service_LoaderConfiguring the Service Loader/a/h3 pTika has a number of service provider types such as parsers, detectors, and translators. The a href=./api/org/apache/tika/config/ServiceLoader.htmlorg.apache.tika.config.ServiceLoader/a class provides a registry of each type of provider. This allows Tika to create implementations such as a href=./api/org/apache/tika/parser/DefaultParser.htmlorg.apache.tika.parser.DefaultParser/a, a href=./api/org/apache/tika/language/translate/DefaultTranslator.htmlorg.apache.tika.language.translate.DefaultTranslator/a, and a href=./api/org/apache/tika/detect/DefaultDetector.htmlorg.apache.tika.detect.DefaultDetector/a that can match the appropriate provider to an incoming piece of content./p pThe ServiceLoader's registry can be populated either statically or dynamically./p -pStatic Static loading is the default which requires no configuration. This configuration options is used in Tika deployments where the Tika JAR files reside together in the same classloader hierarchy. The services provides are loaded from provider configuration files located within the tika-parsers JAR file at META-INF/services./p -pDynamic Dynamic loading may be required if the tika service providers will reside in different classloaders such as in OSGi. 
To allow a provider created in tika-config.xml to utilize dynamically loaded services you need to configure the ServiceLoader to be dynamic with the following configuration:/p +div class=section +h4Statica name=Static/a/h4 +pStatic loading is the default which requires no configuration. This configuration options is used in Tika deployments where the Tika JAR files reside together in the same classloader hierarchy. The services provides are loaded from provider configuration files located within the tika-parsers JAR file at META-INF/services./p/div +div class=section +h4Dynamica name=Dynamic/a/h4 +pDynamic loading may be required if the tika service providers will reside in different classloaders such as in OSGi. To allow a provider created in tika-config.xml to utilize dynamically loaded services you need to configure the ServiceLoader to be dynamic with the following configuration:/p div prelt;propertiesgt; lt;service-loader dynamic=quot;truequot;/gt; -lt;/propertiesgt;/pre/div -pThe ServiceLoader can contains a handler to deal with errors that occur during provider initialization. For example if a class fails to initialize
svn commit: r1696605 - in /tika/site/src/site/apt/1.11: ./ configuring.apt examples.apt formats.apt
Author: nick Date: Wed Aug 19 14:53:30 2015 New Revision: 1696605 URL: http://svn.apache.org/r1696605 Log: Start on the 1.11 docs, for the pieces that need updating during development Added: tika/site/src/site/apt/1.11/ tika/site/src/site/apt/1.11/configuring.apt tika/site/src/site/apt/1.11/examples.apt - copied unchanged from r1696597, tika/site/src/site/apt/1.10/examples.apt tika/site/src/site/apt/1.11/formats.apt - copied, changed from r1696597, tika/site/src/site/apt/1.10/formats.apt Added: tika/site/src/site/apt/1.11/configuring.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.11/configuring.apt?rev=1696605view=auto == --- tika/site/src/site/apt/1.11/configuring.apt (added) +++ tika/site/src/site/apt/1.11/configuring.apt Wed Aug 19 14:53:30 2015 @@ -0,0 +1,214 @@ + + Configuring Tika + + +~~ Licensed to the Apache Software Foundation (ASF) under one or more +~~ contributor license agreements. See the NOTICE file distributed with +~~ this work for additional information regarding copyright ownership. +~~ The ASF licenses this file to You under the Apache License, Version 2.0 +~~ (the License); you may not use this file except in compliance with +~~ the License. You may obtain a copy of the License at +~~ +~~ http://www.apache.org/licenses/LICENSE-2.0 +~~ +~~ Unless required by applicable law or agreed to in writing, software +~~ distributed under the License is distributed on an AS IS BASIS, +~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +~~ See the License for the specific language governing permissions and +~~ limitations under the License. + +Configuring Tika + + Out of the box, Apache Tika will attempt to start with all available + Detectors and Parsers, running with sensible defaults. For most users, + this default configuration will work well. 
+ + This page gives you information on how to configure the various + components of Apache Tika, such as Parsers and Detectors, if you need + fine-grained control over ordering, exclusions and the like. + +%{toc|section=1|fromDepth=1} + +* {Configuring Parsers} + +Through the Tika Config xml, it is possible to have a high degree of control +over which parsers are or aren't used, in what order of preferences etc. It +is also possible to override just certain parts, to (for example) have default +except for PDF. + +Currently, it is only possible to have a single parser run against a document. +There is on-going discussion around fallback parsers and combining the output +of multiple parsers running on a document, but none of these are available yet. + +To override some parser certain default behaviours, include the {{{ DefaultParser }}} +in your configuration, with excludes, then add other parser definitions in. +To prevent the {{{ DefaultParser }}} (with its auto-discovery) being used, +simply omit it from your config, and list all other parsers you want instead. + +To override just some default behaviour, you can use a Tika Config something +like this: + +--- +?xml version=1.0 encoding=UTF-8? 
+properties + parsers +!-- Default Parser for most things, except for 2 mime types, and never + use the Executable Parser -- +parser class=org.apache.tika.parser.DefaultParser + mime-excludeimage/jpeg/mime-exclude + mime-excludeapplication/pdf/mime-exclude + parser-exclude class=org.apache.tika.parser.executable.ExecutableParser/ +/parser +!-- Use a different parser for PDF -- +parser class=org.apache.tika.parser.EmptyParser + mimeapplication/pdf/mime +/parser + /parsers +/properties +--- + +To configure things in code, the key classes to use to build up your own custom +parser hierarchy are + {{{./api/org/apache/tika/parser/DefaultParser.html}org.apache.tika.parser.DefaultParser}}, + {{{./api/org/apache/tika/parser/CompositeParser.html}org.apache.tika.parser.CompositeParser}} +and + {{{./api/org/apache/tika/parser/ParserDecorator.html}org.apache.tika.parser.ParserDecorator}}. + +* {Configuring Detectors} + +Through the Tika Config xml, it is possible to have a high degree of control +over which detectors are or aren't used, in what order of preferences etc. It +is also possible to override just certain parts, to (for example) have default +except for no POIFS Container Detection. + +To override some detector certain default behaviours, include the +{{{ DefaultDetector }}}, with any {{{ detector-exclude }}} entries you need, +in your configuration, then add other detector definitions in. To prevent +the {{{ DefaultDetector }}} (with its auto-discovery) being used, simply omit it +from your config, and list all
svn commit: r1696322 - /tika/trunk/CHANGES.txt
Author: nick Date: Mon Aug 17 18:10:09 2015 New Revision: 1696322 URL: http://svn.apache.org/r1696322 Log: Changelog update Modified: tika/trunk/CHANGES.txt Modified: tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1696322r1=1696321r2=1696322view=diff == --- tika/trunk/CHANGES.txt (original) +++ tika/trunk/CHANGES.txt Mon Aug 17 18:10:09 2015 @@ -9,6 +9,9 @@ Release 1.11 - Current Development * Upgraded to ASM 5.0.4 (TIKA-1705). + * Corrected Tika Config XML detector definition explicit loading +of MimeTypes (TIKA-1708) + Release 1.10 - 8/1/2015
svn commit: r1696159 - in /tika/trunk: tika-core/src/test/java/org/apache/tika/config/ tika-example/ tika-parsers/src/main/java/org/apache/tika/parser/mbox/ tika-parsers/src/test/java/org/apache/tika/
Author: nick Date: Sun Aug 16 18:00:57 2015 New Revision: 1696159 URL: http://svn.apache.org/r1696159 Log: Outlook detection with custom config tests, based on work by Justin Palmer TIKA-1708 Added: tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.xml tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-default.xml Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java tika/trunk/tika-example/pom.xml tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java?rev=1696159r1=1696158r2=1696159view=diff == --- tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java (original) +++ tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java Sun Aug 16 18:00:57 2015 @@ -20,6 +20,7 @@ import static org.junit.Assert.assertNot import java.net.URL; +import org.apache.tika.TikaTest; import org.apache.tika.parser.ParseContext; import org.junit.After; @@ -29,7 +30,7 @@ import org.junit.After; * that {@link TikaConfigTest} can't, do due to a need for the * full set of real classes of parsers / detectors */ -public abstract class AbstractTikaConfigTest { +public abstract class AbstractTikaConfigTest extends TikaTest { protected static ParseContext context = new ParseContext(); protected static String getConfigPath(String config) throws Exception { Modified: tika/trunk/tika-example/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/pom.xml?rev=1696159r1=1696158r2=1696159view=diff == --- tika/trunk/tika-example/pom.xml (original) +++ tika/trunk/tika-example/pom.xml Sun 
Aug 16 18:00:57 2015 @@ -64,6 +64,13 @@ /dependency dependency groupIdorg.apache.tika/groupId + artifactIdtika-core/artifactId + version${project.version}/version + typetest-jar/type + scopetest/scope +/dependency +dependency + groupIdorg.apache.tika/groupId artifactIdtika-parsers/artifactId version${project.version}/version typetest-jar/type Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1696159r1=1696158r2=1696159view=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java Sun Aug 16 18:00:57 2015 @@ -46,14 +46,13 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; /** - * @author Tran Nam Quang - * @author hong-thai.nguyen + * Parser for MS Outlook PST email storage files */ public class OutlookPSTParser extends AbstractParser { private static final long serialVersionUID = 620998217748364063L; -private static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application(vnd.ms-outlook-pst); +public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application(vnd.ms-outlook-pst); private static final SetMediaType SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE); private static AttributesImpl createAttribute(String attName, String attValue) { Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java?rev=1696159r1=1696158r2=1696159view=diff == --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java Sun Aug 
16 18:00:57 2015 @@ -25,8 +25,12 @@ import org.apache.tika.detect.CompositeD import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; import org.apache.tika.detect.EmptyDetector; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.mbox.OutlookPSTParser; import org.apache.tika.parser.microsoft.POIFSContainerDetector; import org.apache.tika.parser.pkg.ZipContainerDetector; +import
svn commit: r1696158 - in /tika/trunk: tika-core/src/test/java/org/apache/tika/TikaTest.java tika-parsers/src/test/java/org/apache/tika/TikaTest.java
Author: nick Date: Sun Aug 16 17:58:55 2015 New Revision: 1696158 URL: http://svn.apache.org/r1696158 Log: Move the parent test class of many Tika tests to core/test, so core tests can use it too Added: tika/trunk/tika-core/src/test/java/org/apache/tika/TikaTest.java - copied unchanged from r1696139, tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Removed: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
svn commit: r1696160 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
Author: nick Date: Sun Aug 16 18:35:26 2015 New Revision: 1696160 URL: http://svn.apache.org/r1696160 Log: TIKA-1708 If the Tika Config detector entry calls for MimeTypes, use the already created one, avoid creating a new empty one Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1696160r1=1696159r2=1696160view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sun Aug 16 18:35:26 2015 @@ -434,6 +434,8 @@ public class TikaConfig { abstract Class? extends T getLoaderClass(); // Generics workaround abstract boolean isComposite(T loaded); abstract boolean isComposite(Class? extends T loadedClass); +abstract T preLoadOne(Class? extends T loadedClass, String classname, +MimeTypes mimeTypes) throws TikaException; abstract CT createDefault(MimeTypes mimeTypes, ServiceLoader loader); abstract CT createComposite(ListT loaded, MimeTypes mimeTypes, ServiceLoader loader); abstract T createComposite(Class? extends T compositeClass, @@ -479,15 +481,11 @@ public class TikaConfig { try { Class? extends T loadedClass = loader.getServiceClass(getLoaderClass(), name); - -// Check for classes which can't be set in config -if (AutoDetectParser.class.isAssignableFrom(loadedClass)) { -// https://issues.apache.org/jira/browse/TIKA-866 -throw new TikaException( -AutoDetectParser not supported in a parser -+ configuration element: + name); -} +// Do pre-load checks and short-circuits +loaded = preLoadOne(loadedClass, name, mimeTypes); +if (loaded != null) return loaded; + // Is this a composite or decorated class? 
If so, support recursion if (isComposite(loadedClass)) { // Get the child objects for it @@ -562,6 +560,19 @@ public class TikaConfig { return Parser.class; } @Override +Parser preLoadOne(Class? extends Parser loadedClass, String classname, + MimeTypes mimeTypes) throws TikaException { +// Check for classes which can't be set in config +if (AutoDetectParser.class.isAssignableFrom(loadedClass)) { +// https://issues.apache.org/jira/browse/TIKA-866 +throw new TikaException( +AutoDetectParser not supported in a parser ++ configuration element: + classname); +} +// Continue with normal loading +return null; +} +@Override boolean isComposite(Parser loaded) { return loaded instanceof CompositeParser; } @@ -657,6 +668,17 @@ public class TikaConfig { return Detector.class; } @Override +Detector preLoadOne(Class? extends Detector loadedClass, String classname, +MimeTypes mimeTypes) throws TikaException { +// If they asked for the mime types as a detector, give +// them the one we've already created. TIKA-1708 +if (MimeTypes.class.equals(loadedClass)) { +return mimeTypes; +} +// Continue with normal loading +return null; +} +@Override boolean isComposite(Detector loaded) { return loaded instanceof CompositeDetector; } @@ -728,6 +750,12 @@ public class TikaConfig { return Translator.class; } @Override +Translator preLoadOne(Class? extends Translator loadedClass, String classname, + MimeTypes mimeTypes) throws TikaException { +// Continue with normal loading +return null; +} +@Override boolean isComposite(Translator loaded) { return false; } @Override boolean isComposite(Class? extends Translator loadedClass) { return false; } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java?rev=1696160r1=1696159r2=1696160view=diff
svn commit: r1696054 - in /tika/trunk/tika-parsers: ./ src/main/java/org/apache/tika/parser/journal/ src/main/resources/META-INF/services/ src/main/resources/org/apache/tika/parser/journal/ src/test/j
Author: nick Date: Sat Aug 15 14:57:54 2015 New Revision: 1696054 URL: http://svn.apache.org/r1696054 Log: Back out r1695816, so the build can pass again, pending a fix of the broken grobid poms. Fix being tracked in TIKA-1699 Removed: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/ tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/ tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf Modified: tika/trunk/tika-parsers/pom.xml tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Modified: tika/trunk/tika-parsers/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1696054r1=1696053r2=1696054view=diff == --- tika/trunk/tika-parsers/pom.xml (original) +++ tika/trunk/tika-parsers/pom.xml Sat Aug 15 14:57:54 2015 @@ -232,14 +232,6 @@ version0.7/version /dependency - !-- GROBID Dependencies -- - dependency - groupIdorg.grobid/groupId - artifactIdgrobid-core/artifactId - version0.3.4/version - /dependency - - !-- Provided dependencies -- dependency groupIdorg.xerial/groupId Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1696054r1=1696053r2=1696054view=diff == --- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original) +++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Aug 15 14:57:54 2015 @@ -66,4 +66,3 @@ org.apache.tika.parser.isatab.ISArchiveP org.apache.tika.parser.geoinfo.GeographicInformationParser org.apache.tika.parser.geo.topic.GeoParser org.apache.tika.parser.external.CompositeExternalParser -org.apache.tika.parser.journal.JournalParser \ No newline at end of file
svn commit: r1694958 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
Author: nick Date: Mon Aug 10 06:14:43 2015 New Revision: 1694958 URL: http://svn.apache.org/r1694958 Log: Fix indents/whitespace Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1694958r1=1694957r2=1694958view=diff == --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java (original) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Mon Aug 10 06:14:43 2015 @@ -37,43 +37,41 @@ import org.xml.sax.ContentHandler; @SuppressWarnings(deprecation) public class MyFirstTika { - - public static void main(String[] args) throws Exception { - String filename = args[0]; - MimeTypes mimeRegistry = TikaConfig.getDefaultConfig() - .getMimeRepository(); - - System.out.println(Examining: [ + filename + ]); - - System.out.println(The MIME type (based on filename) is: [ - + mimeRegistry.getMimeType(filename) + ]); - - System.out.println(The MIME type (based on MAGIC) is: [ - + mimeRegistry.getMimeType(new File(filename)) + ]); - - Detector mimeDetector = (Detector) mimeRegistry; - System.out - .println(The MIME type (based on the Detector interface) is: [ - + mimeDetector.detect(new File(filename).toURI().toURL() - .openStream(), new Metadata()) + ]); - - LanguageIdentifier lang = new LanguageIdentifier(new LanguageProfile( - FileUtils.readFileToString(new File(filename; - - System.out.println(The language of this content is: [ - + lang.getLanguage() + ]); - - Parser parser = TikaConfig.getDefaultConfig().getParser( - MediaType.parse(mimeRegistry.getMimeType(filename).getName())); - Metadata parsedMet = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - parser.parse(new File(filename).toURI().toURL().openStream(), handler, - parsedMet, new ParseContext()); - - 
System.out.println(Parsed Metadata: ); - System.out.println(parsedMet); - System.out.println(Parsed Text: ); - System.out.println(handler.toString()); - - } +public static void main(String[] args) throws Exception { +String filename = args[0]; +MimeTypes mimeRegistry = TikaConfig.getDefaultConfig() +.getMimeRepository(); + +System.out.println(Examining: [ + filename + ]); + +System.out.println(The MIME type (based on filename) is: [ ++ mimeRegistry.getMimeType(filename) + ]); + +System.out.println(The MIME type (based on MAGIC) is: [ ++ mimeRegistry.getMimeType(new File(filename)) + ]); + +Detector mimeDetector = (Detector) mimeRegistry; +System.out +.println(The MIME type (based on the Detector interface) is: [ ++ mimeDetector.detect(new File(filename).toURI().toURL() +.openStream(), new Metadata()) + ]); + +LanguageIdentifier lang = new LanguageIdentifier(new LanguageProfile( +FileUtils.readFileToString(new File(filename; + +System.out.println(The language of this content is: [ ++ lang.getLanguage() + ]); + +Parser parser = TikaConfig.getDefaultConfig().getParser( +MediaType.parse(mimeRegistry.getMimeType(filename).getName())); +Metadata parsedMet = new Metadata(); +ContentHandler handler = new BodyContentHandler(); +parser.parse(new File(filename).toURI().toURL().openStream(), handler, +parsedMet, new ParseContext()); + +System.out.println(Parsed Metadata: ); +System.out.println(parsedMet); +System.out.println(Parsed Text: ); +System.out.println(handler.toString()); +} }
svn commit: r1694961 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
Author: nick Date: Mon Aug 10 06:24:57 2015 New Revision: 1694961 URL: http://svn.apache.org/r1694961 Log: Several people on StackOverflow are getting confused by this example, show how to use AutoDetectParser first, all the components second Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1694961r1=1694960r2=1694961view=diff == --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java (original) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Mon Aug 10 06:24:57 2015 @@ -19,11 +19,13 @@ import java.io.File; import org.apache.commons.io.FileUtils; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.language.LanguageProfile; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; @@ -33,14 +35,45 @@ import org.xml.sax.ContentHandler; * Demonstrates how to call the different components within Tika: its * {@link Detector} framework (aka MIME identification and repository), its * {@link Parser} interface, its {@link LanguageIdentifier} and other goodies. 
+ * It also shows the easy way via {@link AutoDetectParser} */ @SuppressWarnings(deprecation) public class MyFirstTika { public static void main(String[] args) throws Exception { String filename = args[0]; -MimeTypes mimeRegistry = TikaConfig.getDefaultConfig() -.getMimeRepository(); +TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); + +Metadata metadata = new Metadata(); +String text = parseUsingComponents(filename, tikaConfig, metadata); +System.out.println(Parsed Metadata: ); +System.out.println(metadata); +System.out.println(Parsed Text: ); +System.out.println(text); + +System.out.println(-); + +metadata = new Metadata(); +text = parseUsingAutoDetect(filename, tikaConfig, metadata); +System.out.println(Parsed Metadata: ); +System.out.println(metadata); +System.out.println(Parsed Text: ); +System.out.println(text); +} + +public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, +Metadata metadata) throws Exception { +System.out.println(Handling using AutoDetectParser: [ + filename + ]); + +AutoDetectParser parser = new AutoDetectParser(tikaConfig); +ContentHandler handler = new BodyContentHandler(); +TikaInputStream stream = TikaInputStream.get(new File(filename)); +parser.parse(stream, handler, metadata, new ParseContext()); +return handler.toString(); +} +public static String parseUsingComponents(String filename, TikaConfig tikaConfig, +Metadata metadata) throws Exception { +MimeTypes mimeRegistry = tikaConfig.getMimeRepository(); System.out.println(Examining: [ + filename + ]); @@ -51,8 +84,7 @@ public class MyFirstTika { + mimeRegistry.getMimeType(new File(filename)) + ]); Detector mimeDetector = (Detector) mimeRegistry; -System.out -.println(The MIME type (based on the Detector interface) is: [ +System.out.println(The MIME type (based on the Detector interface) is: [ + mimeDetector.detect(new File(filename).toURI().toURL() .openStream(), new Metadata()) + ]); @@ -62,16 +94,12 @@ public class MyFirstTika { System.out.println(The 
language of this content is: [ + lang.getLanguage() + ]); -Parser parser = TikaConfig.getDefaultConfig().getParser( +Parser parser = tikaConfig.getParser( MediaType.parse(mimeRegistry.getMimeType(filename).getName())); -Metadata parsedMet = new Metadata(); ContentHandler handler = new BodyContentHandler(); parser.parse(new File(filename).toURI().toURL().openStream(), handler, -parsedMet, new ParseContext()); - -System.out.println(Parsed Metadata: ); -System.out.println(parsedMet); -System.out.println(Parsed Text: ); -System.out.println(handler.toString()); +metadata, new ParseContext()); + +return handler.toString(); } }
svn commit: r1694962 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
Author: nick Date: Mon Aug 10 06:33:51 2015 New Revision: 1694962 URL: http://svn.apache.org/r1694962 Log: Replace deprecated method use and outdated practice from the example Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1694962r1=1694961r2=1694962view=diff == --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java (original) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Mon Aug 10 06:33:51 2015 @@ -15,6 +15,7 @@ package org.apache.tika.example; import java.io.File; +import java.io.InputStream; import org.apache.commons.io.FileUtils; import org.apache.tika.config.TikaConfig; @@ -35,10 +36,9 @@ import org.xml.sax.ContentHandler; * Demonstrates how to call the different components within Tika: its * {@link Detector} framework (aka MIME identification and repository), its * {@link Parser} interface, its {@link LanguageIdentifier} and other goodies. 
+ * * It also shows the easy way via {@link AutoDetectParser} */ - -@SuppressWarnings(deprecation) public class MyFirstTika { public static void main(String[] args) throws Exception { String filename = args[0]; @@ -77,16 +77,18 @@ public class MyFirstTika { System.out.println(Examining: [ + filename + ]); +metadata.set(Metadata.RESOURCE_NAME_KEY, filename); System.out.println(The MIME type (based on filename) is: [ -+ mimeRegistry.getMimeType(filename) + ]); ++ mimeRegistry.detect(null, metadata) + ]); +InputStream stream = TikaInputStream.get(new File(filename)); System.out.println(The MIME type (based on MAGIC) is: [ -+ mimeRegistry.getMimeType(new File(filename)) + ]); ++ mimeRegistry.detect(stream, metadata) + ]); -Detector mimeDetector = (Detector) mimeRegistry; +stream = TikaInputStream.get(new File(filename)); +Detector detector = tikaConfig.getDetector(); System.out.println(The MIME type (based on the Detector interface) is: [ -+ mimeDetector.detect(new File(filename).toURI().toURL() -.openStream(), new Metadata()) + ]); ++ detector.detect(stream, metadata) + ]); LanguageIdentifier lang = new LanguageIdentifier(new LanguageProfile( FileUtils.readFileToString(new File(filename; @@ -94,11 +96,14 @@ public class MyFirstTika { System.out.println(The language of this content is: [ + lang.getLanguage() + ]); -Parser parser = tikaConfig.getParser( -MediaType.parse(mimeRegistry.getMimeType(filename).getName())); +// Get a non-detecting parser that handles all the types it can +Parser parser = tikaConfig.getParser(); +// Tell it what we think the content is +MediaType type = detector.detect(stream, metadata); +metadata.set(Metadata.CONTENT_TYPE, type.toString()); +// Have the file parsed to get the content and metadata ContentHandler handler = new BodyContentHandler(); -parser.parse(new File(filename).toURI().toURL().openStream(), handler, -metadata, new ParseContext()); +parser.parse(stream, handler, metadata, new ParseContext()); return handler.toString(); }
svn commit: r1694974 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
Author: nick Date: Mon Aug 10 07:00:03 2015 New Revision: 1694974 URL: http://svn.apache.org/r1694974 Log: One more improvement Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1694974r1=1694973r2=1694974view=diff == --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java (original) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Mon Aug 10 07:00:03 2015 @@ -67,7 +67,7 @@ public class MyFirstTika { AutoDetectParser parser = new AutoDetectParser(tikaConfig); ContentHandler handler = new BodyContentHandler(); -TikaInputStream stream = TikaInputStream.get(new File(filename)); +TikaInputStream stream = TikaInputStream.get(new File(filename), metadata); parser.parse(stream, handler, metadata, new ParseContext()); return handler.toString(); }
svn commit: r1694584 - /tika/trunk/tika-parent/pom.xml
Author: nick Date: Thu Aug 6 23:02:23 2015 New Revision: 1694584 URL: http://svn.apache.org/r1694584 Log: Move to the most recent org.apache parent pom Modified: tika/trunk/tika-parent/pom.xml Modified: tika/trunk/tika-parent/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-parent/pom.xml?rev=1694584&r1=1694583&r2=1694584&view=diff == --- tika/trunk/tika-parent/pom.xml (original) +++ tika/trunk/tika-parent/pom.xml Thu Aug 6 23:02:23 2015 @@ -25,7 +25,7 @@ <parent> <groupId>org.apache</groupId> <artifactId>apache</artifactId> -<version>10</version> +<version>17</version> <relativePath /> </parent>
svn commit: r1694585 - /tika/trunk/tika-core/pom.xml
Author: nick Date: Thu Aug 6 23:07:26 2015 New Revision: 1694585 URL: http://svn.apache.org/r1694585 Log: More Tika Core rat excludes Modified: tika/trunk/tika-core/pom.xml Modified: tika/trunk/tika-core/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/pom.xml?rev=1694585&r1=1694584&r2=1694585&view=diff == --- tika/trunk/tika-core/pom.xml (original) +++ tika/trunk/tika-core/pom.xml Thu Aug 6 23:07:26 2015 @@ -85,6 +85,8 @@ <configuration> <excludes> <exclude>src/test/resources/org/apache/tika/**</exclude> + <exclude>src/main/resources/org/apache/tika/language/*.ngp</exclude> + <exclude>src/main/resources/org/apache/tika/detect/*.nnmodel</exclude> </excludes> </configuration> </plugin>
svn commit: r1694587 - in /tika/trunk: tika-app/ tika-java7/ tika-server/ tika-translate/src/test/java/org/apache/tika/language/translate/
Author: nick Date: Thu Aug 6 23:18:34 2015 New Revision: 1694587 URL: http://svn.apache.org/r1694587 Log: License headers and Apache Rat excludes Modified: tika/trunk/tika-app/pom.xml tika/trunk/tika-java7/pom.xml tika/trunk/tika-server/pom.xml tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/Lingo24TranslatorTest.java tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/MosesTranslatorTest.java Modified: tika/trunk/tika-app/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/pom.xml?rev=1694587r1=1694586r2=1694587view=diff == --- tika/trunk/tika-app/pom.xml (original) +++ tika/trunk/tika-app/pom.xml Thu Aug 6 23:18:34 2015 @@ -177,6 +177,15 @@ /execution /executions /plugin + plugin +groupIdorg.apache.rat/groupId +artifactIdapache-rat-plugin/artifactId +configuration + excludes +excludesrc/test/resources/test-data/**/exclude + /excludes +/configuration + /plugin /plugins /build Modified: tika/trunk/tika-java7/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-java7/pom.xml?rev=1694587r1=1694586r2=1694587view=diff == --- tika/trunk/tika-java7/pom.xml (original) +++ tika/trunk/tika-java7/pom.xml Thu Aug 6 23:18:34 2015 @@ -60,6 +60,16 @@ /instructions /configuration /plugin + plugin +groupIdorg.apache.rat/groupId +artifactIdapache-rat-plugin/artifactId +configuration + excludes + excludesrc/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector/exclude +excludesrc/test/resources/test-documents/*/exclude + /excludes +/configuration + /plugin /plugins /build Modified: tika/trunk/tika-server/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/pom.xml?rev=1694587r1=1694586r2=1694587view=diff == --- tika/trunk/tika-server/pom.xml (original) +++ tika/trunk/tika-server/pom.xml Thu Aug 6 23:18:34 2015 @@ -256,6 +256,17 @@ /executions /plugin plugin +groupIdorg.apache.rat/groupId +artifactIdapache-rat-plugin/artifactId +configuration + excludes 
+excludesrc/main/resources/tikaserver-version.properties/exclude +excludesrc/test/resources/*/exclude +excludeREADME.md/exclude + /excludes +/configuration + /plugin + plugin groupIdcom.qmino/groupId artifactIdmiredot-maven-plugin/artifactId version1.4/version Modified: tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/Lingo24TranslatorTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/Lingo24TranslatorTest.java?rev=1694587r1=1694586r2=1694587view=diff == --- tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/Lingo24TranslatorTest.java (original) +++ tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/Lingo24TranslatorTest.java Thu Aug 6 23:18:34 2015 @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.tika.language.translate; import static org.junit.Assert.assertEquals; Modified: tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/MosesTranslatorTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/MosesTranslatorTest.java?rev=1694587r1=1694586r2=1694587view=diff == --- tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/MosesTranslatorTest.java (original) +++ tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/MosesTranslatorTest.java Thu Aug 6 23:18:34 2015 @@ -1,3 +1,19 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES
svn commit: r1693733 - /tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
Author: nick Date: Sat Aug 1 17:02:26 2015 New Revision: 1693733 URL: http://svn.apache.org/r1693733 Log: TIKA-1702 Move the parser and detector creation logic to the config loader classes Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693733r1=1693732r2=1693733view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sat Aug 1 17:02:26 2015 @@ -129,7 +129,7 @@ public class TikaConfig { DetectorXmlLoader detectorLoader = new DetectorXmlLoader(); this.mimeTypes = typesFromDomElement(element); -this.detector = detectorFromDomElement(element, mimeTypes, loader); +this.detector = detectorLoader.loadOverall(element, mimeTypes, loader); this.parser = parserLoader.loadOverall(element, mimeTypes, loader); this.translator = translatorFromDomElement(element, loader); } @@ -213,8 +213,7 @@ public class TikaConfig { this.mimeTypes = typesFromDomElement(element); this.parser = parserLoader.loadOverall(element, mimeTypes, loader); -this.detector = -detectorFromDomElement(element, mimeTypes, loader); +this.detector = detectorLoader.loadOverall(element, mimeTypes, loader); this.translator = translatorFromDomElement(element, loader); } catch (SAXException e) { throw new TikaException( @@ -358,137 +357,6 @@ public class TikaConfig { return getDefaultMimeTypes(null); } } - -//private static CompositeParser parserFromDomElement( -//Element element, MimeTypes mimeTypes, ServiceLoader loader) -//throws TikaException, IOException { -//ListParser parsers = new ArrayListParser(); -// -//// Find the parser children of the parsers tag, if any -//for (Element pe : getTopLevelElementChildren(element, parsers, parser)) { 
-//parsers.add(parserFromParserDomElement(pe, mimeTypes, loader)); -//} -// -//if (parsers.isEmpty()) { -//// No parsers defined, create a DefaultParser -//return getDefaultParser(mimeTypes, loader); -//} else if (parsers.size() == 1 parsers.get(0) instanceof CompositeParser) { -//// Single Composite defined, use that -//return (CompositeParser)parsers.get(0); -//} else { -//// Wrap the defined parsers up in a Composite -//MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); -//return new CompositeParser(registry, parsers); -//} -//} -private static Parser parserFromParserDomElement( -Element parserNode, MimeTypes mimeTypes, ServiceLoader loader) -throws TikaException, IOException { -String name = parserNode.getAttribute(class); -Parser parser = null; - -try { -Class? extends Parser parserClass = -loader.getServiceClass(Parser.class, name); -// https://issues.apache.org/jira/browse/TIKA-866 -if (AutoDetectParser.class.isAssignableFrom(parserClass)) { -throw new TikaException( -AutoDetectParser not supported in a parser -+ configuration element: + name); -} - -// Is this a composite or decorated parser? If so, support recursion -if (CompositeParser.class.isAssignableFrom(parserClass) || -ParserDecorator.class.isAssignableFrom(parserClass)) { - -// Get the child parsers for it -ListParser childParsers = new ArrayListParser(); -NodeList childParserNodes = parserNode.getElementsByTagName(parser); -if (childParserNodes.getLength() 0) { -for (int i = 0; i childParserNodes.getLength(); i++) { -childParsers.add(parserFromParserDomElement( -(Element)childParserNodes.item(i), mimeTypes, loader -)); -} -} - -// Get the list of parsers to exclude -SetClass? extends Parser excludeParsers = new HashSetClass? extends Parser(); -NodeList excludeParserNodes = parserNode.getElementsByTagName(parser-exclude); -if (excludeParserNodes.getLength() 0) { -for (int i = 0; i excludeParserNodes.getLength
svn commit: r1693747 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/ tika-core/src/main/java/org/apache/tika/language/translate/ tika-parsers/src/test/java/org/apache/tika/config/ ti
Author: nick Date: Sat Aug 1 17:53:53 2015 New Revision: 1693747 URL: http://svn.apache.org/r1693747 Log: Convert Translator config to the new pattern for TIKA-1702, and add unit tests for Translator xml config Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.xml tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.xml tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.xml Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693747r1=1693746r2=1693747view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sat Aug 1 17:53:53 2015 @@ -127,11 +127,12 @@ public class TikaConfig { throws TikaException, IOException { ParserXmlLoader parserLoader = new ParserXmlLoader(); DetectorXmlLoader detectorLoader = new DetectorXmlLoader(); +TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader(); this.mimeTypes = typesFromDomElement(element); this.detector = detectorLoader.loadOverall(element, mimeTypes, loader); this.parser = parserLoader.loadOverall(element, mimeTypes, loader); -this.translator = translatorFromDomElement(element, loader); +this.translator = translatorLoader.loadOverall(element, mimeTypes, loader); } /** @@ -210,11 
+211,12 @@ public class TikaConfig { Element element = getBuilder().parse(stream).getDocumentElement(); ParserXmlLoader parserLoader = new ParserXmlLoader(); DetectorXmlLoader detectorLoader = new DetectorXmlLoader(); +TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader(); this.mimeTypes = typesFromDomElement(element); this.parser = parserLoader.loadOverall(element, mimeTypes, loader); this.detector = detectorLoader.loadOverall(element, mimeTypes, loader); -this.translator = translatorFromDomElement(element, loader); +this.translator = translatorLoader.loadOverall(element, mimeTypes, loader); } catch (SAXException e) { throw new TikaException( Specified Tika configuration has syntax errors: @@ -322,15 +324,24 @@ public class TikaConfig { } private static ListElement getTopLevelElementChildren(Element element, String parentName, String childrenName) throws TikaException { -// Should be only zero or one parsers / detectors etc tag -NodeList nodes = element.getElementsByTagName(parentName); -if (nodes.getLength() 1) { -throw new TikaException(Properties may not contain multiple +parentName+ entries); +Node parentNode = null; +if (parentName != null) { +// Should be only zero or one parsers / detectors etc tag +NodeList nodes = element.getElementsByTagName(parentName); +if (nodes.getLength() 1) { +throw new TikaException(Properties may not contain multiple +parentName+ entries); +} +else if (nodes.getLength() == 1) { +parentNode = nodes.item(0); +} +} else { +// All children directly on the master element +parentNode = element; } -else if (nodes.getLength() == 1) { + +if (parentNode != null) { // Find only the direct child parser/detector objects -Node parsersE = nodes.item(0); -nodes = parsersE.getChildNodes(); +NodeList nodes = parentNode.getChildNodes(); ListElement elements = new ArrayListElement(); for (int i = 0; i nodes.getLength(); i++) { Node node = nodes.item(i); @@ -383,39 +394,9 @@ public class TikaConfig { if (types != null) return types; return 
Collections.emptySet(); } - -private static Translator translatorFromDomElement( -Element element, ServiceLoader loader) -throws TikaException
svn commit: r1693716 [3/5] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.2/ publish/1.3/ pu
Modified: tika/site/publish/1.5/formats.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.5/formats.html?rev=1693716r1=1693715r2=1693716view=diff == --- tika/site/publish/1.5/formats.html (original) +++ tika/site/publish/1.5/formats.html Sat Aug 1 15:25:44 2015 @@ -204,6 +204,7 @@ + li class=expanded a href=../1.9/index.htmlApache Tika 1.9/a @@ -230,6 +231,10 @@ /li li class=none +a href=../1.9/configuring.htmlConfiguring Tika/a + /li + +li class=none a href=../1.9/examples.htmlUsage Examples/a /li Modified: tika/site/publish/1.5/gettingstarted.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.5/gettingstarted.html?rev=1693716r1=1693715r2=1693716view=diff == --- tika/site/publish/1.5/gettingstarted.html (original) +++ tika/site/publish/1.5/gettingstarted.html Sat Aug 1 15:25:44 2015 @@ -256,6 +256,7 @@ curl http://.../document.doc \ + li class=expanded a href=../1.9/index.htmlApache Tika 1.9/a @@ -282,6 +283,10 @@ curl http://.../document.doc \ /li li class=none +a href=../1.9/configuring.htmlConfiguring Tika/a + /li + +li class=none a href=../1.9/examples.htmlUsage Examples/a /li Modified: tika/site/publish/1.5/index.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.5/index.html?rev=1693716r1=1693715r2=1693716view=diff == --- tika/site/publish/1.5/index.html (original) +++ tika/site/publish/1.5/index.html Sat Aug 1 15:25:44 2015 @@ -203,6 +203,7 @@ + li class=expanded a href=../1.9/index.htmlApache Tika 1.9/a @@ -229,6 +230,10 @@ /li li class=none +a href=../1.9/configuring.htmlConfiguring Tika/a + /li + +li class=none a href=../1.9/examples.htmlUsage Examples/a /li Modified: tika/site/publish/1.5/parser.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.5/parser.html?rev=1693716r1=1693715r2=1693716view=diff == --- tika/site/publish/1.5/parser.html (original) +++ tika/site/publish/1.5/parser.html Sat Aug 1 15:25:44 2015 @@ -215,6 +215,7 @@ try { + li class=expanded a href=../1.9/index.htmlApache Tika 1.9/a @@ -241,6 +242,10 @@ 
try { /li li class=none +a href=../1.9/configuring.htmlConfiguring Tika/a + /li + +li class=none a href=../1.9/examples.htmlUsage Examples/a /li Modified: tika/site/publish/1.5/parser_guide.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.5/parser_guide.html?rev=1693716r1=1693715r2=1693716view=diff == --- tika/site/publish/1.5/parser_guide.html (original) +++ tika/site/publish/1.5/parser_guide.html Sat Aug 1 15:25:44 2015 @@ -216,6 +216,7 @@ public class HelloParser extends Abstrac + li class=expanded a href=../1.9/index.htmlApache Tika 1.9/a @@ -242,6 +243,10 @@ public class HelloParser extends Abstrac /li li class=none +a href=../1.9/configuring.htmlConfiguring Tika/a + /li + +li class=none a href=../1.9/examples.htmlUsage Examples/a /li Modified: tika/site/publish/1.6/detection.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.6/detection.html?rev=1693716r1=1693715r2=1693716view=diff == --- tika/site/publish/1.6/detection.html (original) +++ tika/site/publish/1.6/detection.html Sat Aug 1 15:25:44 2015 @@ -200,6 +200,7 @@ for (InputStream is : myListOfStreams) { + li
svn commit: r1693716 [5/5] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.2/ publish/1.3/ pu
Modified: tika/site/publish/1.9/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.9/examples.html?rev=1693716r1=1693715r2=1693716view=diff == --- tika/site/publish/1.9/examples.html (original) +++ tika/site/publish/1.9/examples.html Sat Aug 1 15:25:44 2015 @@ -116,41 +116,41 @@ pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text/pstyle type=text/css @import url('attached-includes/css/shCoreDefault.css'); /style -div id=highlighter_294673 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number49 index0 alt2code class=java keywordpublic/code code class=java plainString parseToStringExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException {/code/divdiv class=line number50 index1 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream stream = ParsingExample./codecode class=java keywordclass/codecode class=java plain.getResourceAsStream(/codecode class=java stringtest.doc/codecode class=java plain);/code/divdiv class=line number51 index2 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = /codecode class=java keywordnew/code code class=java plainTika();/code/div div class=line number52 index3 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code code class=java plain{/code/divdiv class=line number53 index4 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java keywordreturn/code code class=java plaintika.parseToString(stream);/code/divdiv class=line number54 index5 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain} /codecode class=java keywordfinally/code code class=java plain{/code/divdiv class=line number55 index6 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java plainstream.close();/code/divdiv class=line number56 index7 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain}/code/divdiv class=line number57 index8 alt2code class=java plain}/code/div/ div/td/tr/tbody/table/div/div +div id=highlighter_34225 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number53 index0 alt2code class=java keywordpublic/code code class=java plainString parseToStringExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException {/code/divdiv class=line number54 index1 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream stream = ParsingExample./codecode class=java keywordclass/codecode class=java plain.getResourceAsStream(/codecode class=java stringtest.doc/codecode class=java plain);/code/divdiv class=line number55 index2 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = /codecode class=java keywordnew/code code class=java plainTika();/code/divd iv class=line number56 index3 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code code class=java plain{/code/divdiv class=line number57 index4 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java keywordreturn/code code class=java plaintika.parseToString(stream);/code/divdiv class=line number58 index5 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain} /codecode class=java keywordfinally/code code class=java plain{/code/divdiv class=line number59 index6 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java plainstream.close();/code/divdiv class=line number60 index7 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain}/code/divdiv class=line number61 index8 alt2code class=java 
plain}/code/div/d iv/td/tr/tbody/table/div/div div class=section h4a name=Parsing_using_the_Auto-Detect_ParserParsing using the Auto-Detect Parser/a/h4 -pFor more control, you can call the a href=./api/org/apache/tika/parser/Parser.htmlTika Parsers/a directly. Most likely, you'll want to start out using the a href=./api/org/apache/tika/parser/AutoDetectParser.htmlAuto-Detect Parser/a, which automatically figures out what kind of content you have, then calls the appropriate parser for you./pdiv id=highlighter_420078 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number83 index0 alt2code class=java keywordpublic/code code class=java plainString parseExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException
svn commit: r1693716 [4/5] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.2/ publish/1.3/ pu
Modified: tika/site/publish/1.8/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.8/examples.html?rev=1693716r1=1693715r2=1693716view=diff == --- tika/site/publish/1.8/examples.html (original) +++ tika/site/publish/1.8/examples.html Sat Aug 1 15:25:44 2015 @@ -115,41 +115,41 @@ pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text/pstyle type=text/css @import url('attached-includes/css/shCoreDefault.css'); /style -div id=highlighter_489927 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number49 index0 alt2code class=java keywordpublic/code code class=java plainString parseToStringExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException {/code/divdiv class=line number50 index1 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream stream = ParsingExample./codecode class=java keywordclass/codecode class=java plain.getResourceAsStream(/codecode class=java stringtest.doc/codecode class=java plain);/code/divdiv class=line number51 index2 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = /codecode class=java keywordnew/code code class=java plainTika();/code/div div class=line number52 index3 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code code class=java plain{/code/divdiv class=line number53 index4 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java keywordreturn/code code class=java plaintika.parseToString(stream);/code/divdiv class=line number54 index5 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain} /codecode class=java keywordfinally/code code class=java plain{/code/divdiv class=line number55 index6 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java plainstream.close();/code/divdiv class=line number56 index7 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain}/code/divdiv class=line number57 index8 alt2code class=java plain}/code/div/ div/td/tr/tbody/table/div/div +div id=highlighter_823561 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number53 index0 alt2code class=java keywordpublic/code code class=java plainString parseToStringExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException {/code/divdiv class=line number54 index1 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream stream = ParsingExample./codecode class=java keywordclass/codecode class=java plain.getResourceAsStream(/codecode class=java stringtest.doc/codecode class=java plain);/code/divdiv class=line number55 index2 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = /codecode class=java keywordnew/code code class=java plainTika();/code/div div class=line number56 index3 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code code class=java plain{/code/divdiv class=line number57 index4 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java keywordreturn/code code class=java plaintika.parseToString(stream);/code/divdiv class=line number58 index5 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain} /codecode class=java keywordfinally/code code class=java plain{/code/divdiv class=line number59 index6 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java plainstream.close();/code/divdiv class=line number60 index7 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain}/code/divdiv class=line number61 index8 alt2code class=java 
plain}/code/div/ div/td/tr/tbody/table/div/div div class=section h4a name=Parsing_using_the_Auto-Detect_ParserParsing using the Auto-Detect Parser/a/h4 -pFor more control, you can call the a href=./api/org/apache/tika/parser/Parser.htmlTika Parsers/a directly. Most likely, you'll want to start out using the a href=./api/org/apache/tika/parser/AutoDetectParser.htmlAuto-Detect Parser/a, which automatically figures out what kind of content you have, then calls the appropriate parser for you./pdiv id=highlighter_934037 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number83 index0 alt2code class=java keywordpublic/code code class=java plainString parseExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException
svn commit: r1693716 [1/5] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.2/ publish/1.3/ pu
Author: nick Date: Sat Aug 1 15:25:44 2015 New Revision: 1693716 URL: http://svn.apache.org/r1693716 Log: Republish with the 1.9 Configuration section in the menu Modified: tika/site/publish/0.10/detection.html tika/site/publish/0.10/formats.html tika/site/publish/0.10/gettingstarted.html tika/site/publish/0.10/index.html tika/site/publish/0.10/parser.html tika/site/publish/0.10/parser_guide.html tika/site/publish/0.5/documentation.html tika/site/publish/0.5/formats.html tika/site/publish/0.5/gettingstarted.html tika/site/publish/0.5/index.html tika/site/publish/0.6/formats.html tika/site/publish/0.6/gettingstarted.html tika/site/publish/0.6/index.html tika/site/publish/0.6/parser.html tika/site/publish/0.7/detection.html tika/site/publish/0.7/formats.html tika/site/publish/0.7/gettingstarted.html tika/site/publish/0.7/index.html tika/site/publish/0.7/parser.html tika/site/publish/0.7/parser_guide.html tika/site/publish/0.8/detection.html tika/site/publish/0.8/formats.html tika/site/publish/0.8/gettingstarted.html tika/site/publish/0.8/index.html tika/site/publish/0.8/parser.html tika/site/publish/0.8/parser_guide.html tika/site/publish/0.9/detection.html tika/site/publish/0.9/formats.html tika/site/publish/0.9/gettingstarted.html tika/site/publish/0.9/index.html tika/site/publish/0.9/parser.html tika/site/publish/0.9/parser_guide.html tika/site/publish/1.0/detection.html tika/site/publish/1.0/formats.html tika/site/publish/1.0/gettingstarted.html tika/site/publish/1.0/index.html tika/site/publish/1.0/parser.html tika/site/publish/1.0/parser_guide.html tika/site/publish/1.1/detection.html tika/site/publish/1.1/formats.html tika/site/publish/1.1/gettingstarted.html tika/site/publish/1.1/index.html tika/site/publish/1.1/parser.html tika/site/publish/1.1/parser_guide.html tika/site/publish/1.10/examples.html tika/site/publish/1.10/formats.html tika/site/publish/1.2/detection.html tika/site/publish/1.2/formats.html tika/site/publish/1.2/gettingstarted.html 
tika/site/publish/1.2/index.html tika/site/publish/1.2/parser.html tika/site/publish/1.2/parser_guide.html tika/site/publish/1.3/detection.html tika/site/publish/1.3/formats.html tika/site/publish/1.3/gettingstarted.html tika/site/publish/1.3/index.html tika/site/publish/1.3/parser.html tika/site/publish/1.3/parser_guide.html tika/site/publish/1.4/detection.html tika/site/publish/1.4/formats.html tika/site/publish/1.4/gettingstarted.html tika/site/publish/1.4/index.html tika/site/publish/1.4/parser.html tika/site/publish/1.4/parser_guide.html tika/site/publish/1.5/detection.html tika/site/publish/1.5/formats.html tika/site/publish/1.5/gettingstarted.html tika/site/publish/1.5/index.html tika/site/publish/1.5/parser.html tika/site/publish/1.5/parser_guide.html tika/site/publish/1.6/detection.html tika/site/publish/1.6/formats.html tika/site/publish/1.6/gettingstarted.html tika/site/publish/1.6/index.html tika/site/publish/1.6/parser.html tika/site/publish/1.6/parser_guide.html tika/site/publish/1.7/detection.html tika/site/publish/1.7/examples.html tika/site/publish/1.7/formats.html tika/site/publish/1.7/gettingstarted.html tika/site/publish/1.7/index.html tika/site/publish/1.7/parser.html tika/site/publish/1.7/parser_guide.html tika/site/publish/1.8/detection.html tika/site/publish/1.8/examples.html tika/site/publish/1.8/formats.html tika/site/publish/1.8/gettingstarted.html tika/site/publish/1.8/index.html tika/site/publish/1.8/parser.html tika/site/publish/1.8/parser_guide.html tika/site/publish/1.9/detection.html tika/site/publish/1.9/examples.html tika/site/publish/1.9/formats.html tika/site/publish/1.9/gettingstarted.html tika/site/publish/1.9/index.html tika/site/publish/1.9/parser.html tika/site/publish/1.9/parser_guide.html tika/site/publish/contribute.html tika/site/publish/distribution-management.html tika/site/publish/download.html tika/site/publish/index.html tika/site/publish/integration.html tika/site/publish/issue-tracking.html 
tika/site/publish/license.html tika/site/publish/mail-lists.html tika/site/publish/plugin-management.html tika/site/publish/plugins.html tika/site/publish/project-info.html tika/site/publish/project-summary.html tika/site/publish/source-repository.html tika/site/publish/team-list.html tika/site/src/site/site.xml Modified: tika/site/publish/0.10/detection.html URL: http://svn.apache.org/viewvc/tika/site/publish/0.10/detection.html?rev=1693716r1=1693715r2=1693716view=diff == --- tika/site/publish/0.10
svn commit: r1693746 - /tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java
Author: nick Date: Sat Aug 1 17:39:40 2015 New Revision: 1693746 URL: http://svn.apache.org/r1693746 Log: Empty Translator, similar to the ones for Parser and Detector, for use in testing etc Added: tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java Added: tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java?rev=1693746view=auto == --- tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java (added) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java Sat Aug 1 17:39:40 2015 @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.language.translate; + +/** + * Dummy translator that always declines to give any text. Useful as a + * sentinel translator for when none others are available. + * for unknown document types. 
+ */ +public class EmptyTranslator implements Translator { +public String translate(String text, String sourceLanguage, String targetLanguage) { +return null; +} + +public String translate(String text, String targetLanguage) { +return null; +} + +public boolean isAvailable() { +return true; +} +}
svn commit: r1693715 - in /tika/site: publish/1.9/configuring.html src/site/apt/1.9/configuring.apt
Author: nick Date: Sat Aug 1 15:24:45 2015 New Revision: 1693715 URL: http://svn.apache.org/r1693715 Log: TIKA-1702 more documentation on configuration Added: tika/site/publish/1.9/configuring.html Modified: tika/site/src/site/apt/1.9/configuring.apt Added: tika/site/publish/1.9/configuring.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.9/configuring.html?rev=1693715view=auto == --- tika/site/publish/1.9/configuring.html (added) +++ tika/site/publish/1.9/configuring.html Sat Aug 1 15:24:45 2015 @@ -0,0 +1,394 @@ +!DOCTYPE html PUBLIC -//W3C//DTD XHTML 1.0 Transitional//EN + http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd; + +!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + License); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+-- + + + + + + + +html xmlns=http://www.w3.org/1999/xhtml; + head +meta http-equiv=Content-Type content=text/html; charset=UTF-8 / +titleApache Tika - Configuring Tika/title +style type=text/css media=all + @import url(../css/site.css); +/style +link rel=icon type=image/png href=../tikaNoText16.png / +script type=text/javascript + function selectProvider(form) { +provider = form.elements['searchProvider'].value; +if (provider == any) { + if (Math.random() 0.5) { +provider = lucid; + } else { +provider = sl; + } +} +if (provider == lucid) { + form.action = http://find.searchhub.org/p:tika;; +} else if (provider == sl) { + form.action = http://search-lucene.com/tika;; +} +days = 90; +date = new Date(); +date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); +expires = ; expires= + date.toGMTString(); +document.cookie = searchProvider= + provider + expires + ; path=/; + } + function initProvider() { +if (document.cookie.length0) { + cStart=document.cookie.indexOf(searchProvider=); + if (cStart!=-1) { +cStart=cStart + searchProvider=.length; +cEnd=document.cookie.indexOf(;, cStart); +if (cEnd==-1) { + cEnd=document.cookie.length; +} +provider = unescape(document.cookie.substring(cStart,cEnd)); +document.forms['searchform'].elements['searchProvider'].value = provider; + } +} +document.forms['searchform'].elements['q'].focus(); + } +/script + /head + body onLoad=initProvider(); +div id=body + div id=banner +a href=http://tika.apache.org; id=bannerLeft title=Apache Tika + img src=http://tika.apache.org/tika.png; alt=Apache Tika +width=292 height=100//a +a href=http://www.apache.org/; id=bannerRight + title=The Apache Software Foundation + img src=http://tika.apache.org/asf-logo.gif; alt=The Apache Software Foundation +width=387 height=100//a + /div + div id=content +!-- Licensed to the Apache Software Foundation (ASF) under one or more --!-- contributor license agreements. 
See the NOTICE file distributed with --!-- this work for additional information regarding copyright ownership. --!-- The ASF licenses this file to You under the Apache License, Version 2.0 --!-- (the License); you may not use this file except in compliance with --!-- the License. You may obtain a copy of the License at --!-- --!-- http://www.apache.org/licenses/LICENSE-2.0 --!-- --!-- Unless required by applicable law or agreed to in writing, software --!-- distributed under the License is distributed on an AS IS BASIS, --!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --!-- See the License for the specific language governing permissions and --!-- limitations under the License. --div class=section +h2Configuring Tikaa name=Configuring_Tika/a/h2 +pOut of the box, Apache Tika will attempt to start with all available Detectors and Parsers, running with sensible defaults. For most users, this default configuration will work well./p +pThis page gives you information on how to configure the various components of Apache Tika
svn commit: r1693745 - /tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
Author: nick Date: Sat Aug 1 17:27:16 2015 New Revision: 1693745 URL: http://svn.apache.org/r1693745 Log: If DefaultTranslator has multiple translators loaded, use the first available, not just blindly the first Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java?rev=1693745r1=1693744r2=1693745view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java Sat Aug 1 17:27:16 2015 @@ -17,15 +17,22 @@ package org.apache.tika.language.translate; -import org.apache.tika.config.ServiceLoader; -import org.apache.tika.exception.TikaException; - import java.io.IOException; import java.util.Collections; import java.util.Comparator; import java.util.List; -public class DefaultTranslator implements Translator{ +import org.apache.tika.config.ServiceLoader; +import org.apache.tika.exception.TikaException; + +/** + * A translator which picks the first available {@link Translator} + * implementations available through the + * {@link javax.imageio.spi.ServiceRegistry service provider mechanism}. 
+ * + * @since Apache Tika 1.6 + */ +public class DefaultTranslator implements Translator { private transient final ServiceLoader loader; public DefaultTranslator(ServiceLoader loader) { @@ -58,17 +65,39 @@ public class DefaultTranslator implement }); return translators; } +/** + * Returns the first available translator, or null if none are + */ +private static Translator getFirstAvailable(ServiceLoader loader) { +for (Translator t : getDefaultTranslators(loader)) { +if (t.isAvailable()) return t; +} +return null; +} +/** + * Translate, using the first available service-loaded translator + */ public String translate(String text, String sourceLanguage, String targetLanguage) throws TikaException, IOException { -return getDefaultTranslators(loader).get(0).translate(text, sourceLanguage, targetLanguage); +Translator t = getFirstAvailable(loader); +if (t != null) { +return t.translate(text, sourceLanguage, targetLanguage); +} +throw new TikaException(No translators currently available); } +/** + * Translate, using the first available service-loaded translator + */ public String translate(String text, String targetLanguage) throws TikaException, IOException { -return getDefaultTranslators(loader).get(0).translate(text, targetLanguage); +Translator t = getFirstAvailable(loader); +if (t != null) { +return t.translate(text, targetLanguage); +} +throw new TikaException(No translators currently available); } public boolean isAvailable() { -return getDefaultTranslators(loader).get(0).isAvailable(); +return getFirstAvailable(loader) != null; } - }
svn commit: r1693713 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/config/TikaConfig.java test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
Author: nick Date: Sat Aug 1 14:53:36 2015 New Revision: 1693713 URL: http://svn.apache.org/r1693713 Log: TIKA-1702 Refactor some of the config parser loading to be more re-usable for detectors, and bring the method signature in line WRT Composite vs not (must always be composite) Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693713r1=1693712r2=1693713view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sat Aug 1 14:53:36 2015 @@ -66,7 +66,7 @@ public class TikaConfig { return MimeTypes.getDefaultMimeTypes(loader); } -protected Detector getDefaultDetector( +protected CompositeDetector getDefaultDetector( MimeTypes types, ServiceLoader loader) { return new DefaultDetector(types, loader); } @@ -80,7 +80,7 @@ public class TikaConfig { return new DefaultTranslator(loader); } private final CompositeParser parser; -private final Detector detector; +private final CompositeDetector detector; private final Translator translator; private final MimeTypes mimeTypes; @@ -317,6 +317,33 @@ public class TikaConfig { } return null; } +private static ListElement getTopLevelElementChildren(Element element, +String parentName, String childrenName) throws TikaException { +// Should be only zero or one parsers / detectors etc tag +NodeList nodes = element.getElementsByTagName(parentName); +if (nodes.getLength() 1) { +throw new TikaException(Properties may not contain multiple +parentName+ entries); +} +else if (nodes.getLength() == 1) { +// Find only the direct child parser/detector objects +Node parsersE = nodes.item(0); +nodes = 
parsersE.getChildNodes(); +ListElement elements = new ArrayListElement(); +for (int i = 0; i nodes.getLength(); i++) { +Node node = nodes.item(i); +if (node instanceof Element) { +Element nodeE = (Element)node; +if (childrenName.equals(nodeE.getTagName())) { +elements.add(nodeE); +} +} +} +return elements; +} else { +// No elements of this type +return Collections.emptyList(); +} +} private static MimeTypes typesFromDomElement(Element element) throws TikaException, IOException { @@ -333,24 +360,9 @@ public class TikaConfig { throws TikaException, IOException { ListParser parsers = new ArrayListParser(); -// Should be only zero or one parsers tag -NodeList nodes = element.getElementsByTagName(parsers); -if (nodes.getLength() 1) { -throw new TikaException(Properties may not contain multiple Parsers entries); -} -else if (nodes.getLength() == 1) { -// Find only the direct child parser objects -Node parsersE = nodes.item(0); -nodes = parsersE.getChildNodes(); -for (int i = 0; i nodes.getLength(); i++) { -Node node = nodes.item(i); -if (node instanceof Element) { -Element nodeE = (Element)node; -if (parser.equals(nodeE.getTagName())) { -parsers.add(parserFromParserDomElement(nodeE, mimeTypes, loader)); -} -} -} +// Find the parser children of the parsers tag, if any +for (Element pe : getTopLevelElementChildren(element, parsers, parser)) { +parsers.add(parserFromParserDomElement(pe, mimeTypes, loader)); } if (parsers.isEmpty()) { @@ -500,7 +512,7 @@ public class TikaConfig { return Collections.emptySet(); } -private static Detector detectorFromDomElement( +private static CompositeDetector detectorFromDomElement( Element element, MimeTypes mimeTypes, ServiceLoader loader) throws TikaException, IOException { ListDetector detectors = new ArrayListDetector(); Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java?rev=1693713r1
svn commit: r1693721 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
Author: nick Date: Sat Aug 1 16:22:28 2015 New Revision: 1693721 URL: http://svn.apache.org/r1693721 Log: TIKA-1702 Start moving to a loader class pattern for common Detector and Parser (+later others) Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693721r1=1693720r2=1693721view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sat Aug 1 16:22:28 2015 @@ -125,9 +125,12 @@ public class TikaConfig { private TikaConfig(Element element, ServiceLoader loader) throws TikaException, IOException { +ParserXmlLoader parserLoader = new ParserXmlLoader(); +DetectorXmlLoader detectorLoader = new DetectorXmlLoader(); + this.mimeTypes = typesFromDomElement(element); this.detector = detectorFromDomElement(element, mimeTypes, loader); -this.parser = parserFromDomElement(element, mimeTypes, loader); +this.parser = parserLoader.loadOverall(element, mimeTypes, loader); this.translator = translatorFromDomElement(element, loader); } @@ -204,11 +207,12 @@ public class TikaConfig { } try { -Element element = -getBuilder().parse(stream).getDocumentElement(); +Element element = getBuilder().parse(stream).getDocumentElement(); +ParserXmlLoader parserLoader = new ParserXmlLoader(); +DetectorXmlLoader detectorLoader = new DetectorXmlLoader(); + this.mimeTypes = typesFromDomElement(element); -this.parser = -parserFromDomElement(element, mimeTypes, loader); +this.parser = parserLoader.loadOverall(element, mimeTypes, loader); this.detector = detectorFromDomElement(element, mimeTypes, loader); this.translator = translatorFromDomElement(element, loader); @@ 
-355,28 +359,28 @@ public class TikaConfig { } } -private static CompositeParser parserFromDomElement( -Element element, MimeTypes mimeTypes, ServiceLoader loader) -throws TikaException, IOException { -ListParser parsers = new ArrayListParser(); - -// Find the parser children of the parsers tag, if any -for (Element pe : getTopLevelElementChildren(element, parsers, parser)) { -parsers.add(parserFromParserDomElement(pe, mimeTypes, loader)); -} - -if (parsers.isEmpty()) { -// No parsers defined, create a DefaultParser -return getDefaultParser(mimeTypes, loader); -} else if (parsers.size() == 1 parsers.get(0) instanceof CompositeParser) { -// Single Composite defined, use that -return (CompositeParser)parsers.get(0); -} else { -// Wrap the defined parsers up in a Composite -MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); -return new CompositeParser(registry, parsers); -} -} +//private static CompositeParser parserFromDomElement( +//Element element, MimeTypes mimeTypes, ServiceLoader loader) +//throws TikaException, IOException { +//ListParser parsers = new ArrayListParser(); +// +//// Find the parser children of the parsers tag, if any +//for (Element pe : getTopLevelElementChildren(element, parsers, parser)) { +//parsers.add(parserFromParserDomElement(pe, mimeTypes, loader)); +//} +// +//if (parsers.isEmpty()) { +//// No parsers defined, create a DefaultParser +//return getDefaultParser(mimeTypes, loader); +//} else if (parsers.size() == 1 parsers.get(0) instanceof CompositeParser) { +//// Single Composite defined, use that +//return (CompositeParser)parsers.get(0); +//} else { +//// Wrap the defined parsers up in a Composite +//MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); +//return new CompositeParser(registry, parsers); +//} +//} private static Parser parserFromParserDomElement( Element parserNode, MimeTypes mimeTypes, ServiceLoader loader) throws TikaException, IOException { @@ -585,4 +589,79 @@ public class TikaConfig { return 
translators.get(0); } } + +private static
svn commit: r1693762 - /tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
Author: nick Date: Sat Aug 1 20:58:07 2015 New Revision: 1693762 URL: http://svn.apache.org/r1693762 Log: Fix up the Probabilistic Mime Detection Test Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java?rev=1693762r1=1693761r2=1693762view=diff == --- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java (original) +++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java Sat Aug 1 20:58:07 2015 @@ -18,6 +18,7 @@ package org.apache.tika.mime; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import java.io.ByteArrayInputStream; @@ -28,10 +29,7 @@ import java.nio.charset.Charset; import org.apache.tika.Tika; import org.apache.tika.config.ServiceLoader; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.DefaultProbDetector; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.ProbabilisticMimeDetectionSelector.Builder; import org.junit.Before; @@ -39,41 +37,31 @@ import org.junit.Test; public class ProbabilisticMimeDetectionTestWithTika { private static final Charset UTF8 = Charset.forName(UTF-8); -// private ProbabilisticMimeDetectionSelector proDetector; -private Tika tika; + +private ProbabilisticMimeDetectionSelector proSelector; private MediaTypeRegistry registry; +private Tika tika; /** @inheritDoc */ @Before public void setUp() { -try { -registry = MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry(); -tika = new Tika(new TikaConfig() { -@Override 
-protected CompositeDetector getDefaultDetector(MimeTypes types, -ServiceLoader loader) { -/* - * here is an example with the use of the builder to - * instantiate the object. - */ -Builder builder = new ProbabilisticMimeDetectionSelector.Builder(); -ProbabilisticMimeDetectionSelector proDetector = new ProbabilisticMimeDetectionSelector( -types, builder.priorMagicFileType(0.5f) -.priorExtensionFileType(0.5f) -.priorMetaFileType(0.5f)); -return new DefaultProbDetector(proDetector, loader); -} -}); -} catch (TikaException e) { -// TODO Auto-generated catch block -e.printStackTrace(); -} catch (IOException e) { -// TODO Auto-generated catch block -e.printStackTrace(); -} finally { - -} - +MimeTypes types = MimeTypes.getDefaultMimeTypes(); +ServiceLoader loader = new ServiceLoader(); +registry = types.getMediaTypeRegistry(); + +/* + * here is an example with the use of the builder to + * instantiate the object. + */ +Builder builder = new ProbabilisticMimeDetectionSelector.Builder(); +proSelector = new ProbabilisticMimeDetectionSelector( +types, builder.priorMagicFileType(0.5f) +.priorExtensionFileType(0.5f) +.priorMetaFileType(0.5f)); +DefaultProbDetector detector = new DefaultProbDetector(proSelector, loader); + +// Use a default Tika, except for our different detector +tika = new Tika(detector); } @Test @@ -198,11 +186,6 @@ public class ProbabilisticMimeDetectionT } } -private void assertNotNull(String string, InputStream in) { -// TODO Auto-generated method stub - -} - /** * Test for type detection of empty documents. *
svn commit: r1693759 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/config/ test/java/org/apache/tika/config/ test/resources/org/apache/tika/config/
Author: nick Date: Sat Aug 1 20:40:39 2015 New Revision: 1693759 URL: http://svn.apache.org/r1693759 Log: TIKA-1700 Add TikaConfig constructors that take a ServiceLoader, and add a unit test that shows we (now) use the LoadErrorHandler on that properly for reporting problems with listed class names Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1700-unknown-parser.xml Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java?rev=1693759r1=1693758r2=1693759view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java Sat Aug 1 20:40:39 2015 @@ -173,6 +173,10 @@ public class ServiceLoader { /** * Loads and returns the named service class that's expected to implement * the given interface. 
+ * + * Note that this class does not use the {@link LoadErrorHandler}, a + * {@link ClassNotFoundException} is always returned for unknown + * classes or classes of the wrong type * * @param iface service interface * @param name service class name Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693759r1=1693758r2=1693759view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sat Aug 1 20:40:39 2015 @@ -92,18 +92,25 @@ public class TikaConfig { public TikaConfig(File file) throws TikaException, IOException, SAXException { -this(getBuilder().parse(file)); +this(file, new ServiceLoader()); +} +public TikaConfig(File file, ServiceLoader loader) +throws TikaException, IOException, SAXException { +this(getBuilder().parse(file), loader); } public TikaConfig(URL url) throws TikaException, IOException, SAXException { this(url, ServiceLoader.getContextClassLoader()); } - public TikaConfig(URL url, ClassLoader loader) throws TikaException, IOException, SAXException { this(getBuilder().parse(url.toString()).getDocumentElement(), loader); } +public TikaConfig(URL url, ServiceLoader loader) +throws TikaException, IOException, SAXException { +this(getBuilder().parse(url.toString()).getDocumentElement(), loader); +} public TikaConfig(InputStream stream) throws TikaException, IOException, SAXException { @@ -113,6 +120,9 @@ public class TikaConfig { public TikaConfig(Document document) throws TikaException, IOException { this(document.getDocumentElement()); } +public TikaConfig(Document document, ServiceLoader loader) throws TikaException, IOException { +this(document.getDocumentElement(), loader); +} public TikaConfig(Element element) throws TikaException, IOException { this(element, new ServiceLoader()); @@ 
-418,7 +428,8 @@ public class TikaConfig { // Find the children of the parent tag, if any for (Element le : getTopLevelElementChildren(element, getParentTagName(), getLoaderTagName())) { -loaded.add(loadOne(le, mimeTypes, loader)); +T loadedChild = loadOne(le, mimeTypes, loader); +if (loadedChild != null) loaded.add(loadedChild); } // Build the classes, and wrap as needed @@ -462,9 +473,9 @@ public class TikaConfig { NodeList childNodes = element.getElementsByTagName(getLoaderTagName()); if (childNodes.getLength() 0) { for (int i = 0; i childNodes.getLength(); i++) { -children.add(loadOne( -(Element)childNodes.item(i), mimeTypes, loader -)); +T loadedChild = loadOne((Element)childNodes.item(i), +mimeTypes, loader); +if (loadedChild != null) children.add(loadedChild
svn commit: r1688805 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: nick Date: Thu Jul 2 10:35:06 2015 New Revision: 1688805 URL: http://svn.apache.org/r1688805 Log: Remove change comment, TIKA-1602 Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1688805r1=1688804r2=1688805view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu Jul 2 10:35:06 2015 @@ -5112,7 +5112,7 @@ mime-type type=message/rfc822 magic priority=50 - match value=Status: type=string offset=0/ !-- added custom by Jeremy B. Merril 4/10/14 -- + match value=Status: type=string offset=0/ match value=Relay-Version: type=stringignorecase offset=0/ match value=#!\ rnews type=string offset=0/ match value=N#!\ rnews type=string offset=0/
svn commit: r1688810 - in /tika/site: publish/1.10/ publish/1.6/ publish/1.7/ publish/1.8/ publish/1.9/ src/site/apt/1.10/ src/site/apt/1.6/ src/site/apt/1.7/ src/site/apt/1.8/ src/site/apt/1.9/
Author: nick Date: Thu Jul 2 12:17:21 2015 New Revision: 1688810 URL: http://svn.apache.org/r1688810 Log: Mention Outlook MSG support in the mail formats section Modified: tika/site/publish/1.10/formats.html tika/site/publish/1.6/formats.html tika/site/publish/1.7/formats.html tika/site/publish/1.8/formats.html tika/site/publish/1.9/formats.html tika/site/src/site/apt/1.10/formats.apt tika/site/src/site/apt/1.6/formats.apt tika/site/src/site/apt/1.7/formats.apt tika/site/src/site/apt/1.8/formats.apt tika/site/src/site/apt/1.9/formats.apt Modified: tika/site/publish/1.10/formats.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.10/formats.html?rev=1688810r1=1688809r2=1688810view=diff == --- tika/site/publish/1.10/formats.html (original) +++ tika/site/publish/1.10/formats.html Thu Jul 2 12:17:21 2015 @@ -176,6 +176,7 @@ pThe a href=./api/org/apache/tika/parser/mbox/MboxParser.htmlMboxParser/a can extract email messages from the mbox format used by many email archives and Unix-style mailboxes./p pThe a href=./api/org/apache/tika/parser/mail/RFC822Parser.htmlRFC822Parser/a can process single email messages in the RFC 822 format used by many email clients in their archives / exports./p pThe a href=./api/org/apache/tika/parser/mbox/OutlookPSTParser.htmlOutlookPSTParser/a can extract email messages from the Microsoft Outlook PST email format./p +pThe a href=./api/org/apache/tika/parser/microsoft/OutlookExtractor.htmlOutlookExtractor/a (part of a href=./api/org/apache/tika/parser/microsoft/OfficeParserOfficeParser/a) is able to extract email messages from the Microsoft Outlook MSG email format./p pThe a href=./api/org/apache/tika/parser/microsoft/TNEFParser.htmlTNEFParser/a can extract email attachments from the Microsoft TNEF (Transport Neutral Encoding Format, aka Winmail.dat) used with some Microsoft email clients./p/div div class=section h3a name=CAD_formatsCAD formats/a/h3 Modified: tika/site/publish/1.6/formats.html URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.6/formats.html?rev=1688810r1=1688809r2=1688810view=diff == --- tika/site/publish/1.6/formats.html (original) +++ tika/site/publish/1.6/formats.html Thu Jul 2 12:17:21 2015 @@ -172,7 +172,8 @@ h3a name=Mail_formatsMail formats/a/h3 pThe a href=./api/org/apache/tika/parser/mbox/MboxParser.htmlMboxParser/a can extract email messages from the mbox format used by many email archives and Unix-style mailboxes./p pThe a href=./api/org/apache/tika/parser/mail/RFC822Parser.htmlRFC822Parser/a can process single email messages in the RFC 822 format used by many email clients in their archives / exports./p -pThe a href=./api/org/apache/tika/parser/mbox/PSTParser.htmlPSDParser/a can extract email messages from the Microsoft Outlook PST email format./p/div +pThe a href=./api/org/apache/tika/parser/mbox/PSTParser.htmlPSDParser/a can extract email messages from the Microsoft Outlook PST email format./p +pThe a href=./api/org/apache/tika/parser/microsoft/OutlookExtractor.htmlOutlookExtractor/a (part of a href=./api/org/apache/tika/parser/microsoft/OfficeParserOfficeParser/a) is able to extract email messages from the Microsoft Outlook MSG email format./p/div div class=section h3a name=CAD_formatsCAD formats/a/h3 pThe a href=./api/org/apache/tika/parser/dwg/DWGParser.htmlDWGParser/a can extract simple metadata from the DWG CAD format./p/div Modified: tika/site/publish/1.7/formats.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.7/formats.html?rev=1688810r1=1688809r2=1688810view=diff == --- tika/site/publish/1.7/formats.html (original) +++ tika/site/publish/1.7/formats.html Thu Jul 2 12:17:21 2015 @@ -174,7 +174,8 @@ h3a name=Mail_formatsMail formats/a/h3 pThe a href=./api/org/apache/tika/parser/mbox/MboxParser.htmlMboxParser/a can extract email messages from the mbox format used by many email archives and Unix-style mailboxes./p pThe a href=./api/org/apache/tika/parser/mail/RFC822Parser.htmlRFC822Parser/a can process single 
email messages in the RFC 822 format used by many email clients in their archives / exports./p -pThe a href=./api/org/apache/tika/parser/mbox/PSTParser.htmlPSDParser/a can extract email messages from the Microsoft Outlook PST email format./p/div +pThe a href=./api/org/apache/tika/parser/mbox/PSTParser.htmlPSDParser/a can extract email messages from the Microsoft Outlook PST email format./p +pThe a href=./api/org/apache/tika/parser/microsoft/OutlookExtractor.htmlOutlookExtractor/a (part of a href=./api/org/apache/tika/parser/microsoft/OfficeParserOfficeParser/a) is able to extract email messages from the Microsoft Outlook MSG email format./p/div div class=section h3a name=CAD_formatsCAD
svn commit: r1687945 - in /tika/site/src/site/apt: 0.10/parser.apt 1.10/examples.apt 1.3/parser.apt 1.4/parser.apt 1.5/parser.apt 1.6/parser.apt 1.7/examples.apt 1.7/parser.apt 1.8/examples.apt 1.8/pa
Author: nick Date: Sat Jun 27 16:13:01 2015 New Revision: 1687945 URL: http://svn.apache.org/r1687945 Log: Tika javadocs are in /api/ not /apidocs/, correct links Modified: tika/site/src/site/apt/0.10/parser.apt tika/site/src/site/apt/1.10/examples.apt tika/site/src/site/apt/1.3/parser.apt tika/site/src/site/apt/1.4/parser.apt tika/site/src/site/apt/1.5/parser.apt tika/site/src/site/apt/1.6/parser.apt tika/site/src/site/apt/1.7/examples.apt tika/site/src/site/apt/1.7/parser.apt tika/site/src/site/apt/1.8/examples.apt tika/site/src/site/apt/1.8/parser.apt tika/site/src/site/apt/1.9/examples.apt tika/site/src/site/apt/1.9/parser.apt Modified: tika/site/src/site/apt/0.10/parser.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/0.10/parser.apt?rev=1687945r1=1687944r2=1687945view=diff == --- tika/site/src/site/apt/0.10/parser.apt (original) +++ tika/site/src/site/apt/0.10/parser.apt Sat Jun 27 16:13:01 2015 @@ -134,7 +134,7 @@ try { --- Parser implementations typically use the - {{{./apidocs/org/apache/tika/sax/XHTMLContentHandler.html}XHTMLContentHandler}} + {{{./api/org/apache/tika/sax/XHTMLContentHandler.html}XHTMLContentHandler}} utility class to generate the XHTML output. Dealing with the raw SAX events can be a bit complex, so Apache Tika @@ -238,7 +238,7 @@ try { Tika also contains some general purpose parser implementations that are not targeted at any specific document formats. The most notable of these - is the {{{./apidocs/org/apache/tika/parser/AutoDetectParser.html}AutoDetectParser}} + is the {{{./api/org/apache/tika/parser/AutoDetectParser.html}AutoDetectParser}} class that encapsulates all Tika functionality into a single parser that can handle any types of documents. 
This parser will automatically determine the type of the incoming document based on various heuristics and will then Modified: tika/site/src/site/apt/1.10/examples.apt URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.10/examples.apt?rev=1687945r1=1687944r2=1687945view=diff == --- tika/site/src/site/apt/1.10/examples.apt (original) +++ tika/site/src/site/apt/1.10/examples.apt Sat Jun 27 16:13:01 2015 @@ -34,7 +34,7 @@ Apache Tika API Usage Examples ** {Parsing using the Tika Facade} - The {{{./apidocs/org/apache/tika/Tika.html}Tika facade}}, + The {{{./api/org/apache/tika/Tika.html}Tika facade}}, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @@ -43,9 +43,9 @@ Apache Tika API Usage Examples ** {Parsing using the Auto-Detect Parser} For more control, you can call the - {{{./apidocs/org/apache/tika/parser/Parser.html}Tika Parsers}} + {{{./api/org/apache/tika/parser/Parser.html}Tika Parsers}} directly. Most likely, you'll want to start out using the - {{{./apidocs/org/apache/tika/parser/AutoDetectParser.html}Auto-Detect Parser}}, + {{{./api/org/apache/tika/parser/AutoDetectParser.html}Auto-Detect Parser}}, which automatically figures out what kind of content you have, then calls the appropriate parser for you. @@ -63,7 +63,7 @@ Apache Tika API Usage Examples ** {Parsing to Plain Text} By using the - {{{./apidocs/org/apache/tika/sax/BodyContentHandler.html}BodyContentHandler}}, + {{{./api/org/apache/tika/sax/BodyContentHandler.html}BodyContentHandler}}, you can request that Tika return only the content of the document's body as a plain-text string. @@ -72,15 +72,15 @@ Apache Tika API Usage Examples ** {Parsing to XHTML} By using the - {{{./apidocs/org/apache/tika/sax/ToXMLContentHandler.html}ToXMLContentHandler}}, + {{{./api/org/apache/tika/sax/ToXMLContentHandler.html}ToXMLContentHandler}}, you can get the XHTML content of the whole document as a string. 
%{include|source=src/examples-src/main/java/org/apache/tika/example/ContentHandlerExample.java|snippet=aj:..parseToHTML()|show-gutter=false} If you just want the body of the xhtml document, without the header, you can chain together a - {{{./apidocs/org/apache/tika/sax/BodyContentHandler.html}BodyContentHandler}} - and a {{{./apidocs/org/apache/tika/sax/ToXMLContentHandler.html}ToXMLContentHandler}} + {{{./api/org/apache/tika/sax/BodyContentHandler.html}BodyContentHandler}} + and a {{{./api/org/apache/tika/sax/ToXMLContentHandler.html}ToXMLContentHandler}} as shown: %{include|source=src/examples-src/main/java/org/apache/tika/example/ContentHandlerExample.java|snippet=aj:..parseBodyToHTML()|show-gutter=false} @@ -103,7 +103,7 @@ Apache Tika API Usage Examples ** {Extract Phone Numbers from Content into the Metadata} By using the - {{{./apidocs/org/apache/tika/sax/PhoneExtractingContentHandler.html
svn commit: r1687946 [2/4] - in /tika/site/publish: 0.10/parser.html 1.10/examples.html 1.3/parser.html 1.4/parser.html 1.5/parser.html 1.6/parser.html 1.7/examples.html 1.7/parser.html 1.8/examples.h
Modified: tika/site/publish/1.7/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.7/examples.html?rev=1687946r1=1687945r2=1687946view=diff == --- tika/site/publish/1.7/examples.html (original) +++ tika/site/publish/1.7/examples.html Sat Jun 27 16:15:19 2015 @@ -112,23 +112,23 @@ pTika provides a number of different ways to parse a file. These provide different levels of control, flexibility, and complexity./p div class=section h4a name=Parsing_using_the_Tika_FacadeParsing using the Tika Facade/a/h4 -pThe a href=./apidocs/org/apache/tika/Tika.htmlTika facade/a, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text/pstyle type=text/css +pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text/pstyle type=text/css @import url('attached-includes/css/shCoreDefault.css'); /style div id=highlighter_166145 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number49 index0 alt2code class=java keywordpublic/code code class=java plainString parseToStringExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException {/code/divdiv class=line number50 index1 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream stream = ParsingExample./codecode class=java keywordclass/codecode class=java plain.getResourceAsStream(/codecode class=java stringtest.doc/codecode class=java plain);/code/divdiv class=line number51 index2 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = /codecode class=java keywordnew/code code class=java plainTika();/code/div div class=line number52 index3 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code code class=java plain{/code/divdiv 
class=line number53 index4 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java keywordreturn/code code class=java plaintika.parseToString(stream);/code/divdiv class=line number54 index5 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain} /codecode class=java keywordfinally/code code class=java plain{/code/divdiv class=line number55 index6 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java plainstream.close();/code/divdiv class=line number56 index7 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain}/code/divdiv class=line number57 index8 alt2code class=java plain}/code/div/ div/td/tr/tbody/table/div/div div class=section h4a name=Parsing_using_the_Auto-Detect_ParserParsing using the Auto-Detect Parser/a/h4 -pFor more control, you can call the a href=./apidocs/org/apache/tika/parser/Parser.htmlTika Parsers/a directly. Most likely, you'll want to start out using the a href=./apidocs/org/apache/tika/parser/AutoDetectParser.htmlAuto-Detect Parser/a, which automatically figures out what kind of content you have, then calls the appropriate parser for you./pdiv id=highlighter_969506 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number83 index0 alt2code class=java keywordpublic/code code class=java plainString parseExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException {/code/divdiv class=line number84 index1 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream stream = ParsingExample./codecode class=java keywordclass/codec ode class=java plain.getResourceAsStream(/codecode class=java stringtest.doc/codecode class=java plain);/code/divdiv class=line number85 index2 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainAutoDetectParser parser = /codecode class=java 
keywordnew/code code class=java plainAutoDetectParser();/code/divdiv class=line number86 index3 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainBodyContentHandler handler = /codecode class=java keywordnew/code code class=java plainBodyContentHandler();/code/divdiv class=line number87 index4 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainMetadata metadata = /codecode class=java keywordnew/code code class=java plainMetadata();/code/divdiv class=line number88 index5 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtr y/code code class=java plain{/code/divdiv class=line number89 index6 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java plainparser.parse(stream, handler, metadata);/code/divdiv class=line number90 index7 alt1code
svn commit: r1687946 [4/4] - in /tika/site/publish: 0.10/parser.html 1.10/examples.html 1.3/parser.html 1.4/parser.html 1.5/parser.html 1.6/parser.html 1.7/examples.html 1.7/parser.html 1.8/examples.h
Modified: tika/site/publish/1.9/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.9/examples.html?rev=1687946r1=1687945r2=1687946view=diff == --- tika/site/publish/1.9/examples.html (original) +++ tika/site/publish/1.9/examples.html Sat Jun 27 16:15:19 2015 @@ -113,23 +113,23 @@ pTika provides a number of different ways to parse a file. These provide different levels of control, flexibility, and complexity./p div class=section h4a name=Parsing_using_the_Tika_FacadeParsing using the Tika Facade/a/h4 -pThe a href=./apidocs/org/apache/tika/Tika.htmlTika facade/a, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text/pstyle type=text/css +pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text/pstyle type=text/css @import url('attached-includes/css/shCoreDefault.css'); /style div id=highlighter_294673 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number49 index0 alt2code class=java keywordpublic/code code class=java plainString parseToStringExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException {/code/divdiv class=line number50 index1 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream stream = ParsingExample./codecode class=java keywordclass/codecode class=java plain.getResourceAsStream(/codecode class=java stringtest.doc/codecode class=java plain);/code/divdiv class=line number51 index2 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = /codecode class=java keywordnew/code code class=java plainTika();/code/div div class=line number52 index3 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code code class=java plain{/code/divdiv 
class=line number53 index4 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java keywordreturn/code code class=java plaintika.parseToString(stream);/code/divdiv class=line number54 index5 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain} /codecode class=java keywordfinally/code code class=java plain{/code/divdiv class=line number55 index6 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java plainstream.close();/code/divdiv class=line number56 index7 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain}/code/divdiv class=line number57 index8 alt2code class=java plain}/code/div/ div/td/tr/tbody/table/div/div div class=section h4a name=Parsing_using_the_Auto-Detect_ParserParsing using the Auto-Detect Parser/a/h4 -pFor more control, you can call the a href=./apidocs/org/apache/tika/parser/Parser.htmlTika Parsers/a directly. Most likely, you'll want to start out using the a href=./apidocs/org/apache/tika/parser/AutoDetectParser.htmlAuto-Detect Parser/a, which automatically figures out what kind of content you have, then calls the appropriate parser for you./pdiv id=highlighter_420078 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number83 index0 alt2code class=java keywordpublic/code code class=java plainString parseExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException {/code/divdiv class=line number84 index1 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream stream = ParsingExample./codecode class=java keywordclass/codec ode class=java plain.getResourceAsStream(/codecode class=java stringtest.doc/codecode class=java plain);/code/divdiv class=line number85 index2 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainAutoDetectParser parser = /codecode class=java 
keywordnew/code code class=java plainAutoDetectParser();/code/divdiv class=line number86 index3 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainBodyContentHandler handler = /codecode class=java keywordnew/code code class=java plainBodyContentHandler();/code/divdiv class=line number87 index4 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainMetadata metadata = /codecode class=java keywordnew/code code class=java plainMetadata();/code/divdiv class=line number88 index5 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtr y/code code class=java plain{/code/divdiv class=line number89 index6 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java plainparser.parse(stream, handler, metadata);/code/divdiv class=line number90 index7 alt1code
svn commit: r1687946 [3/4] - in /tika/site/publish: 0.10/parser.html 1.10/examples.html 1.3/parser.html 1.4/parser.html 1.5/parser.html 1.6/parser.html 1.7/examples.html 1.7/parser.html 1.8/examples.h
Modified: tika/site/publish/1.8/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.8/examples.html?rev=1687946r1=1687945r2=1687946view=diff == --- tika/site/publish/1.8/examples.html (original) +++ tika/site/publish/1.8/examples.html Sat Jun 27 16:15:19 2015 @@ -112,23 +112,23 @@ pTika provides a number of different ways to parse a file. These provide different levels of control, flexibility, and complexity./p div class=section h4a name=Parsing_using_the_Tika_FacadeParsing using the Tika Facade/a/h4 -pThe a href=./apidocs/org/apache/tika/Tika.htmlTika facade/a, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text/pstyle type=text/css +pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text/pstyle type=text/css @import url('attached-includes/css/shCoreDefault.css'); /style div id=highlighter_489927 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number49 index0 alt2code class=java keywordpublic/code code class=java plainString parseToStringExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException {/code/divdiv class=line number50 index1 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream stream = ParsingExample./codecode class=java keywordclass/codecode class=java plain.getResourceAsStream(/codecode class=java stringtest.doc/codecode class=java plain);/code/divdiv class=line number51 index2 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = /codecode class=java keywordnew/code code class=java plainTika();/code/div div class=line number52 index3 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code code class=java plain{/code/divdiv 
class=line number53 index4 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java keywordreturn/code code class=java plaintika.parseToString(stream);/code/divdiv class=line number54 index5 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain} /codecode class=java keywordfinally/code code class=java plain{/code/divdiv class=line number55 index6 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java plainstream.close();/code/divdiv class=line number56 index7 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plain}/code/divdiv class=line number57 index8 alt2code class=java plain}/code/div/ div/td/tr/tbody/table/div/div div class=section h4a name=Parsing_using_the_Auto-Detect_ParserParsing using the Auto-Detect Parser/a/h4 -pFor more control, you can call the a href=./apidocs/org/apache/tika/parser/Parser.htmlTika Parsers/a directly. Most likely, you'll want to start out using the a href=./apidocs/org/apache/tika/parser/AutoDetectParser.htmlAuto-Detect Parser/a, which automatically figures out what kind of content you have, then calls the appropriate parser for you./pdiv id=highlighter_934037 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number83 index0 alt2code class=java keywordpublic/code code class=java plainString parseExample() /codecode class=java keywordthrows/code code class=java plainIOException, SAXException, TikaException {/code/divdiv class=line number84 index1 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream stream = ParsingExample./codecode class=java keywordclass/codec ode class=java plain.getResourceAsStream(/codecode class=java stringtest.doc/codecode class=java plain);/code/divdiv class=line number85 index2 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainAutoDetectParser parser = /codecode class=java 
keywordnew/code code class=java plainAutoDetectParser();/code/divdiv class=line number86 index3 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainBodyContentHandler handler = /codecode class=java keywordnew/code code class=java plainBodyContentHandler();/code/divdiv class=line number87 index4 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainMetadata metadata = /codecode class=java keywordnew/code code class=java plainMetadata();/code/divdiv class=line number88 index5 alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtr y/code code class=java plain{/code/divdiv class=line number89 index6 alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode class=java plainparser.parse(stream, handler, metadata);/code/divdiv class=line number90 index7 alt1code
svn commit: r1687946 [1/4] - in /tika/site/publish: 0.10/parser.html 1.10/examples.html 1.3/parser.html 1.4/parser.html 1.5/parser.html 1.6/parser.html 1.7/examples.html 1.7/parser.html 1.8/examples.h
Author: nick Date: Sat Jun 27 16:15:19 2015 New Revision: 1687946 URL: http://svn.apache.org/r1687946 Log: Tika javadocs are in /api/ not /apidocs/, correct links Modified: tika/site/publish/0.10/parser.html tika/site/publish/1.10/examples.html tika/site/publish/1.3/parser.html tika/site/publish/1.4/parser.html tika/site/publish/1.5/parser.html tika/site/publish/1.6/parser.html tika/site/publish/1.7/examples.html tika/site/publish/1.7/parser.html tika/site/publish/1.8/examples.html tika/site/publish/1.8/parser.html tika/site/publish/1.9/examples.html tika/site/publish/1.9/parser.html Modified: tika/site/publish/0.10/parser.html URL: http://svn.apache.org/viewvc/tika/site/publish/0.10/parser.html?rev=1687946r1=1687945r2=1687946view=diff == --- tika/site/publish/0.10/parser.html (original) +++ tika/site/publish/0.10/parser.html Sat Jun 27 16:15:19 2015 @@ -131,7 +131,7 @@ try { ... lt;/bodygt; lt;/htmlgt;/pre/div -pParser implementations typically use the a href=./apidocs/org/apache/tika/sax/XHTMLContentHandler.htmlXHTMLContentHandler/a utility class to generate the XHTML output./p +pParser implementations typically use the a href=./api/org/apache/tika/sax/XHTMLContentHandler.htmlXHTMLContentHandler/a utility class to generate the XHTML output./p pDealing with the raw SAX events can be a bit complex, so Apache Tika comes with a number of utility classes that can be used to process and convert the event stream to other representations./p pFor example, the a href=./api/org/apache/tika/sax/BodyContentHandler.htmlBodyContentHandler/a class can be used to extract just the body part of the XHTML output and feed it either as SAX events to another content handler or as characters to an output stream, a writer, or simply a string. 
The following code snippet parses a document from the standard input stream and outputs the extracted text content to standard output:/p div @@ -173,7 +173,7 @@ try { h3Parser implementationsa name=Parser_implementations/a/h3 pApache Tika comes with a number of parser classes for parsing a href=./formats.htmlvarious document formats/a. You can also extend Tika with your own parsers, and of course any contributions to Tika are warmly welcome./p pThe goal of Tika is to reuse existing parser libraries like a class=externalLink href=http://pdfbox.apache.org/;PDFBox/a or a class=externalLink href=http://poi.apache.org/;Apache POI/a as much as possible, and so most of the parser classes in Tika are adapters to such external libraries./p -pTika also contains some general purpose parser implementations that are not targeted at any specific document formats. The most notable of these is the a href=./apidocs/org/apache/tika/parser/AutoDetectParser.htmlAutoDetectParser/a class that encapsulates all Tika functionality into a single parser that can handle any types of documents. This parser will automatically determine the type of the incoming document based on various heuristics and will then parse the document accordingly./p/div/div +pTika also contains some general purpose parser implementations that are not targeted at any specific document formats. The most notable of these is the a href=./api/org/apache/tika/parser/AutoDetectParser.htmlAutoDetectParser/a class that encapsulates all Tika functionality into a single parser that can handle any types of documents. 
This parser will automatically determine the type of the incoming document based on various heuristics and will then parse the document accordingly./p/div/div /div div id=sidebar div id=navigation Modified: tika/site/publish/1.10/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.10/examples.html?rev=1687946r1=1687945r2=1687946view=diff == --- tika/site/publish/1.10/examples.html (original) +++ tika/site/publish/1.10/examples.html Sat Jun 27 16:15:19 2015 @@ -113,23 +113,23 @@ pTika provides a number of different ways to parse a file. These provide different levels of control, flexibility, and complexity./p div class=section h4a name=Parsing_using_the_Tika_FacadeParsing using the Tika Facade/a/h4 -pThe a href=./apidocs/org/apache/tika/Tika.htmlTika facade/a, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text/pstyle type=text/css +pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text/pstyle type=text/css @import url('attached-includes/css/shCoreDefault.css'); /style div id=highlighter_177280 class=syntaxhighlighter nogutter javatable border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv class=containerdiv class=line number49 index0 alt2code
svn commit: r1687102 - in /tika/site/src/site/apt/1.10: ./ examples.apt formats.apt
Author: nick Date: Tue Jun 23 17:29:17 2015 New Revision: 1687102 URL: http://svn.apache.org/r1687102 Log: Start tracking formats and examples for 1.10 Added: tika/site/src/site/apt/1.10/ tika/site/src/site/apt/1.10/examples.apt - copied unchanged from r1687095, tika/site/src/site/apt/1.9/examples.apt tika/site/src/site/apt/1.10/formats.apt - copied, changed from r1687095, tika/site/src/site/apt/1.9/formats.apt Copied: tika/site/src/site/apt/1.10/formats.apt (from r1687095, tika/site/src/site/apt/1.9/formats.apt) URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.10/formats.apt?p2=tika/site/src/site/apt/1.10/formats.apt&p1=tika/site/src/site/apt/1.9/formats.apt&r1=1687095&r2=1687102&rev=1687102&view=diff == --- tika/site/src/site/apt/1.9/formats.apt (original) +++ tika/site/src/site/apt/1.10/formats.apt Tue Jun 23 17:29:17 2015 @@ -20,7 +20,7 @@ Supported Document Formats This page lists all the document formats supported by the parsers in - Apache Tika 1.9. Follow the links to the various parser class javadocs + Apache Tika 1.10. Follow the links to the various parser class javadocs for more detailed information about each document format and how it is parsed by Tika. @@ -248,6 +248,10 @@ Supported Document Formats is able to extract attribute metadata from the ISO-19139 georgraphic information file format. + The {{{./api/org/apache/tika/parser/geo/topic/GeoParser.html}GeoParser}} + is makes use of a pre-built collection of a geographic gazetteer, to + resolve geographic entities into their positions into the metadata + The {{{./api/org/apache/tika/parser/grib/GribParser.html}GribParser}} is able to extract attribute metadata from the Grib scientific file format.
svn commit: r1686199 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: nick Date: Thu Jun 18 12:06:20 2015 New Revision: 1686199 URL: http://svn.apache.org/r1686199 Log: Add a mime type definition for Java properties files, after a discussion on stackoverflow showed we didn't have one Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1686199&r1=1686198&r2=1686199&view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu Jun 18 12:06:20 2015 @@ -5392,7 +5392,6 @@ <glob pattern="*.pod"/> <glob pattern="*.pom"/> <glob pattern="*.project"/> -<glob pattern="*.properties"/> <glob pattern="*.rng"/> <glob pattern="*.rnx"/> <glob pattern="*.roles"/> @@ -5735,6 +5734,14 @@ <sub-class-of type="text/plain"/> </mime-type> + <mime-type type="text/x-java-properties"> +<_comment>Java Properties</_comment> +<alias type="text/x-properties" /> +<alias type="text/properties" /> +<glob pattern="*.properties"/> +<sub-class-of type="text/plain"/> + </mime-type> + <mime-type type="text/x-jsp"> <_comment>Java Server Page</_comment> <alias type="application/x-httpd-jsp"/>
svn commit: r1686315 - in /tika/trunk/tika-parsers/src/test: java/org/apache/tika/mime/TestMimeTypes.java resources/test-documents/testJAVAPROPS.properties
Author: nick Date: Thu Jun 18 20:12:54 2015 New Revision: 1686315 URL: http://svn.apache.org/r1686315 Log: TIKA-1660 Java Properties sample file and detection test, follows on from r1686199 Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJAVAPROPS.properties Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1686315r1=1686314r2=1686315view=diff == --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Thu Jun 18 20:12:54 2015 @@ -944,6 +944,9 @@ public class TestMimeTypes { assertTypeByData(text/x-csrc, testC.c); assertTypeByData(text/x-chdr, testH.h); +assertTypeByName(text/x-java-source, testJAVA.java); +assertType(text/x-java-properties, testJAVAPROPS.properties); + assertType(text/x-matlab, testMATLAB.m); assertType(text/x-matlab, testMATLAB_wtsgaus.m); assertType(text/x-matlab, testMATLAB_barcast.m); @@ -970,6 +973,7 @@ public class TestMimeTypes { private void assertType(String expected, String filename) throws Exception { InputStream stream = TestMimeTypes.class.getResourceAsStream( /test-documents/ + filename); +assertNotNull(Test file not found: + filename, stream); try { Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, filename); Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJAVAPROPS.properties URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJAVAPROPS.properties?rev=1686315view=auto == --- tika/trunk/tika-parsers/src/test/resources/test-documents/testJAVAPROPS.properties (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testJAVAPROPS.properties Thu Jun 18 
20:12:54 2015 @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the License); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +tika=great +file.content.detection=often.hard +properties\:files=fun with special characters... + +# Logs please! +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n
svn commit: r1684187 - in /tika/trunk: tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
Author: nick Date: Mon Jun 8 13:55:16 2015 New Revision: 1684187 URL: http://svn.apache.org/r1684187 Log: Improve how the Tika CLI reports decorated parsers in --list-parsers Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1684187r1=1684186r2=1684187view=diff == --- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original) +++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Mon Jun 8 13:55:16 2015 @@ -702,12 +702,20 @@ public class TikaCLI { } private void displayParser(Parser p, boolean includeMimeTypes, boolean apt, int i) { +String decorated = null; +if (p instanceof ParserDecorator) { +ParserDecorator pd = (ParserDecorator)p; +decorated = (Wrapped by + pd.getDecorationName() + ); +p = pd.getWrappedParser(); +} + boolean isComposite = (p instanceof CompositeParser); -String name = (p instanceof ParserDecorator) ? - ((ParserDecorator) p).getWrappedParser().getClass().getName() : - p.getClass().getName(); -if (apt){ +String name = p.getClass().getName(); + +if (apt) { name = name.substring(0, name.lastIndexOf(.) + 1) + {{{./api/ + name.replace(., /) + } + name.substring(name.lastIndexOf(.) + 1) + }}; +} else if (decorated != null) { +name += decorated; } if ((apt !isComposite) || !apt) {// Don't display Composite parsers in the apt output. System.out.println(indent(i) + ((apt) ? * : ) + name + (isComposite ? 
(Composite Parser): : )); Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=1684187r1=1684186r2=1684187view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java Mon Jun 8 13:55:16 2015 @@ -56,6 +56,10 @@ public class ParserDecorator extends Abs public SetMediaType getSupportedTypes(ParseContext context) { return types; } +@Override +public String getDecorationName() { +return With Types; +} }; } @@ -81,6 +85,10 @@ public class ParserDecorator extends Abs // Return whatever is left return parserTypes; } +@Override +public String getDecorationName() { +return Without Types; +} }; } @@ -125,6 +133,10 @@ public class ParserDecorator extends Abs tstream.reset(); } } +@Override +public String getDecorationName() { +return With Fallback; +} }; } @@ -163,6 +175,12 @@ public class ParserDecorator extends Abs parser.parse(stream, handler, metadata, context); } +/** + * @return A name/description of the decoration, or null if none available + */ +public String getDecorationName() { +return null; +} /** * Gets the parser wrapped by this ParserDecorator
svn commit: r1684170 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes: CTAKESAnnotationProperty.java CTAKESConfig.java CTAKESContentHandler.java CTAKESParser.java CTAKESSerializer.java CTAKESUtils.java
Author: nick Date: Mon Jun 8 12:25:15 2015 New Revision: 1684170 URL: http://svn.apache.org/r1684170 Log: Fix indents to match http://tika.apache.org/contribute.html#Code_Formatting TIKA-1642 Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java?rev=1684170r1=1684169r2=1684170view=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java Mon Jun 8 12:25:15 2015 @@ -23,24 +23,24 @@ import org.apache.ctakes.typesystem.type * */ public enum CTAKESAnnotationProperty { - BEGIN(start), - END(end), - CONDITIONAL(conditional), - CONFIDENCE(confidence), - DISCOVERY_TECNIQUE(discoveryTechnique), - GENERIC(generic), - HISTORY_OF(historyOf), - ID(id), - ONTOLOGY_CONCEPT_ARR(ontologyConceptArr), - POLARITY(polarity); - - private String name; - - CTAKESAnnotationProperty(String name) { - this.name = name; - } - - public String getName() { - return name; - } +BEGIN(start), +END(end), +CONDITIONAL(conditional), +CONFIDENCE(confidence), +DISCOVERY_TECNIQUE(discoveryTechnique), +GENERIC(generic), +HISTORY_OF(historyOf), +ID(id), +ONTOLOGY_CONCEPT_ARR(ontologyConceptArr), +POLARITY(polarity); + 
+private String name; + +CTAKESAnnotationProperty(String name) { +this.name = name; +} + +public String getName() { +return name; +} } \ No newline at end of file Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java?rev=1684170r1=1684169r2=1684170view=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java Mon Jun 8 12:25:15 2015 @@ -24,314 +24,313 @@ import java.util.Properties; import org.apache.tika.io.NullOutputStream; -/* +/** * Configuration for {@see CTAKESContentHandler}. * * This class allows to enable cTAKES and set its parameters. - * */ public class CTAKESConfig implements Serializable { - /** -* Serial version UID -*/ - private static final long serialVersionUID = -1599741171775528923L; - - // Path to XML descriptor for AnalysisEngine - private String aeDescriptorPath = /ctakes-core/desc/analysis_engine/SentencesAndTokensAggregate.xml; - - // UMLS username - private String UMLSUser = ; - - // UMLS password - private String UMLSPass = ; - - // Enables formatted output - private boolean prettyPrint = true; - - // Type of cTAKES (UIMA) serializer - private CTAKESSerializer serializerType = CTAKESSerializer.XMI; - - // OutputStream object used for CAS serialization - private OutputStream stream = NullOutputStream.NULL_OUTPUT_STREAM; - - // Enables CAS serialization - private boolean serialize = false; - - // Enables text analysis using cTAKES - private boolean text = true; - - // List of metadata to analyze using cTAKES - private String[] metadata = null; - - // List of annotation properties to add to metadata in addition to text covered by an annotation - private CTAKESAnnotationProperty[] annotationProps = null; - - // Character used to separate the 
annotation properties into metadata - private char separatorChar = ':'; - - /** -* Default constructor. -*/ - public CTAKESConfig() { - init(this.getClass().getResourceAsStream(CTAKESConfig.properties)); - } - - /** -* Loads properties from InputStream
svn commit: r1684201 - /tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java
Author: nick Date: Mon Jun 8 15:05:20 2015 New Revision: 1684201 URL: http://svn.apache.org/r1684201 Log: Make the nesting more visually obvious in the Server HTML parsers listing Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java?rev=1684201r1=1684200r2=1684201view=diff == --- tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java (original) +++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java Mon Jun 8 15:05:20 2015 @@ -96,9 +96,11 @@ public class TikaParsers { } if (p.isComposite) { html.append(pComposite Parser/p); +html.append(div style=\margin-left: 1em\\n); for (Parser cp : p.childParsers) { parserAsHTML(new ParserDetails(cp), withMimeTypes, html, level + 1); } +html.append(/div\n); } else if (withMimeTypes) { html.append(pMime Types:); html.append(ul); @@ -110,6 +112,7 @@ public class TikaParsers { html.append(/ul); html.append(/p); } +html.append(\n); } @GET
svn commit: r1684206 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: config/TikaConfig.java parser/ParserDecorator.java
Author: nick Date: Mon Jun 8 15:28:45 2015 New Revision: 1684206 URL: http://svn.apache.org/r1684206 Log: Allow Tika Config xml to have a ParserDecorator with child parsers, and note about how this can work in the javadocs Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1684206r1=1684205r2=1684206view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Mon Jun 8 15:28:45 2015 @@ -381,8 +381,10 @@ public class TikaConfig { + configuration element: + name); } -// Is this a composite parser? If so, support recursion -if (CompositeParser.class.isAssignableFrom(parserClass)) { +// Is this a composite or decorated parser? If so, support recursion +if (CompositeParser.class.isAssignableFrom(parserClass) || +ParserDecorator.class.isAssignableFrom(parserClass)) { + // Get the child parsers for it ListParser childParsers = new ArrayListParser(); NodeList childParserNodes = parserNode.getElementsByTagName(parser); @@ -407,20 +409,36 @@ public class TikaConfig { // Create the Composite Parser Constructor? 
extends Parser c = null; -if (c == null) { +MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); +if (parser == null) { try { c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, Collection.class); -parser = c.newInstance(mimeTypes.getMediaTypeRegistry(), loader, excludeParsers); +parser = c.newInstance(registry, loader, excludeParsers); } catch (NoSuchMethodException me) {} } -if (c == null) { +if (parser == null) { try { c = parserClass.getConstructor(MediaTypeRegistry.class, List.class, Collection.class); -parser = c.newInstance(mimeTypes.getMediaTypeRegistry(), childParsers, excludeParsers); +parser = c.newInstance(registry, childParsers, excludeParsers); +} catch (NoSuchMethodException me) {} +} +// Create as a Parser Decorator +if (parser == null ParserDecorator.class.isAssignableFrom(parserClass)) { +try { +CompositeParser cp = null; +if (childParsers.size() == 1 excludeParsers.size() == 0 +childParsers.get(0) instanceof CompositeParser) { +cp = (CompositeParser)childParsers.get(0); +} else { +cp = new CompositeParser(registry, childParsers, excludeParsers); +} +c = parserClass.getConstructor(Parser.class); +parser = c.newInstance(cp); } catch (NoSuchMethodException me) {} } -if (c == null) { +// Default constructor +if (parser == null) { parser = parserClass.newInstance(); } } else { Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=1684206r1=1684205r2=1684206view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java Mon Jun 8 15:28:45 2015 @@ -30,10 +30,12 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** - * Decorator base class for the {@link Parser} interface. 
This class - * simply delegates all parsing calls to an underlying decorated parser - * instance. Subclasses can provide extra decoration by overriding the + * Decorator base class for the {@link Parser} interface. + * pThis class simply delegates all parsing calls to an underlying decorated + * parser instance. Subclasses can provide extra decoration by overriding the * parse method. + * pTo
svn commit: r1684207 - in /tika/trunk: tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
Author: nick Date: Mon Jun 8 15:29:26 2015 New Revision: 1684207 URL: http://svn.apache.org/r1684207 Log: cTAKES config xml example and code example in JavaDocs TIKA-1642 Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml?rev=1684207view=auto == --- tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml (added) +++ tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml Mon Jun 8 15:29:26 2015 @@ -0,0 +1,24 @@ +?xml version=1.0 encoding=UTF-8? +!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the License); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an AS IS BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+-- +properties + parsers +parser class=org.apache.tika.parser.ctakes.CTAKESParser + parser class=org.apache.tika.parser.DefaultParser/ +/parser + /parsers +/properties Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1684207r1=1684206r2=1684207view=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java Mon Jun 8 15:29:26 2015 @@ -35,7 +35,7 @@ import org.xml.sax.SAXException; * clinical text using Apache cTAKES. * pIt is normally called by supplying an instance to * {@link AutoDetectParser}, such as: - * codeAutoDetectParser parser = new AutoDetectParser(new CTakesParser());/code + * codeAutoDetectParser parser = new AutoDetectParser(new CTAKESParser());/code * pIt can also be used by giving a Tika Config file similar to: * code * gt;properties
svn commit: r1684199 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/ tika-core/src/test/java/org/apache/tika/config/ tika-core/src/test/resources/org/apache/tika/config/ tika-parsers/src/main/java/org/apache/tika/parser/ctakes/
Author: nick Date: Mon Jun 8 14:41:48 2015 New Revision: 1684199 URL: http://svn.apache.org/r1684199 Log: TIKA-1653 Re-do the XML parsing in the Tika Config, so that a parser tag with another inside it doesn't get accidently duplicated at the top level Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1684199r1=1684198r2=1684199view=diff == --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Mon Jun 8 14:41:48 2015 @@ -332,10 +332,25 @@ public class TikaConfig { Element element, MimeTypes mimeTypes, ServiceLoader loader) throws TikaException, IOException { ListParser parsers = new ArrayListParser(); -NodeList nodes = element.getElementsByTagName(parser); -for (int i = 0; i nodes.getLength(); i++) { -Element node = (Element) nodes.item(i); -parsers.add(parserFromParserDomElement(node, mimeTypes, loader)); + +// Should be only zero or one parsers tag +NodeList nodes = element.getElementsByTagName(parsers); +if (nodes.getLength() 1) { +throw new TikaException(Properties may not contain multiple Parsers entries); +} +else if (nodes.getLength() == 1) { +// Find only the direct child parser objects +Node parsersE = nodes.item(0); +nodes = parsersE.getChildNodes(); +for (int i = 0; i nodes.getLength(); i++) { +Node node = nodes.item(i); +if (node instanceof Element) { +Element nodeE = (Element)node; +if (parser.equals(nodeE.getTagName())) { 
+parsers.add(parserFromParserDomElement(nodeE, mimeTypes, loader)); +} +} +} } if (parsers.isEmpty()) { @@ -444,21 +459,26 @@ public class TikaConfig { private static SetMediaType mediaTypesListFromDomElement( Element node, String tag) throws TikaException, IOException { -NodeList mimes = node.getElementsByTagName(tag); -if (mimes.getLength() 0) { -SetMediaType types = new HashSetMediaType(); -for (int j = 0; j mimes.getLength(); j++) { -String mime = getText(mimes.item(j)); -MediaType type = MediaType.parse(mime); -if (type != null) { -types.add(type); -} else { -throw new TikaException( -Invalid media type name: + mime); +SetMediaType types = null; +NodeList children = node.getChildNodes(); +for (int i=0; ichildren.getLength(); i++) { +Node cNode = children.item(i); +if (cNode instanceof Element) { +Element cElement = (Element)cNode; +if (tag.equals(cElement.getTagName())) { +String mime = getText(cElement); +MediaType type = MediaType.parse(mime); +if (type != null) { +if (types == null) types = new HashSetMediaType(); +types.add(type); +} else { +throw new TikaException( +Invalid media type name: + mime); +} } } -return types; } +if (types != null) return types; return Collections.emptySet(); } Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java?rev=1684199r1=1684198r2=1684199view=diff == --- tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java (original) +++ tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java Mon Jun 8 14:41:48 2015 @@ -177,4 +177,39 @@ public class TikaConfigTest { System.clearProperty(tika.config); } } + +/** + * TIKA-1653 If one parser has child parsers, those child parsers shouldn't + * show up at the top level as well
svn commit: r1683076 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: nick Date: Tue Jun 2 11:00:22 2015 New Revision: 1683076 URL: http://svn.apache.org/r1683076 Log: Try to make the low-priority padded PDF magic match more specific, as it looks to have incorrectly triggered on a few of the govdocs text files Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1683076r1=1683075r2=1683076view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue Jun 2 11:00:22 2015 @@ -489,7 +489,8 @@ magic priority=20 !-- Low priority match for %PDF near the start of the file -- !-- Can trigger false positives, so set the priority rather low here -- - match value=%PDF- type=string offset=1:512/ + match value=%PDF-1. type=string offset=1:512/ + match value=%PDF-2. type=string offset=1:512/ /magic glob pattern=*.pdf/ /mime-type
svn commit: r1683101 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java tika-parsers/src/test/resources/test-documents/testBIBTEX.bib
Author: nick Date: Tue Jun 2 13:15:21 2015 New Revision: 1683101 URL: http://svn.apache.org/r1683101 Log: Bibtex entries are case insensitive, and might start with a comment, so tweak magic and add a test file. (Spotted in govdocs1) Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1683101r1=1683100r2=1683101view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue Jun 2 13:15:21 2015 @@ -487,7 +487,7 @@ match value=\xef\xbb\xbf%PDF- type=string offset=0/ /magic magic priority=20 - !-- Low priority match for %PDF near the start of the file -- + !-- Low priority match for %PDF-#.# near the start of the file -- !-- Can trigger false positives, so set the priority rather low here -- match value=%PDF-1. type=string offset=1:512/ match value=%PDF-2. 
type=string offset=1:512/ @@ -2793,17 +2793,34 @@ match value=%%%\ \ type=string offset=73/ match value=%\ BibTeX\ standard\ bibliography\ type=string offset=0/ match value=%%%\ \ @BibTeX-style-file{ type=string offset=73/ - match value=@article{ type=string offset=0/ - match value=@book{ type=string offset=0/ - match value=@inbook{ type=string offset=0/ - match value=@incollection{ type=string offset=0/ - match value=@inproceedings{ type=string offset=0/ - match value=@manual{ type=string offset=0/ - match value=@misc{ type=string offset=0/ - match value=@preamble{ type=string offset=0/ - match value=@phdthesis{ type=string offset=0/ - match value=@techreport{ type=string offset=0/ - match value=@unpublished{ type=string offset=0/ + match value=@article{ type=stringignorecase offset=0/ + match value=@book{ type=stringignorecase offset=0/ + match value=@inbook{ type=stringignorecase offset=0/ + match value=@incollection{ type=stringignorecase offset=0/ + match value=@inproceedings{ type=stringignorecase offset=0/ + match value=@manual{ type=stringignorecase offset=0/ + match value=@misc{ type=stringignorecase offset=0/ + match value=@preamble{ type=stringignorecase offset=0/ + match value=@phdthesis{ type=stringignorecase offset=0/ + match value=@string{ type=stringignorecase offset=0/ + match value=@techreport{ type=stringignorecase offset=0/ + match value=@unpublished{ type=stringignorecase offset=0/ +/magic +magic priority=30 + match value=% type=string offset=0 + match value=\n@article{ type=stringignorecase offset=2:128/ + match value=\n@book{ type=stringignorecase offset=2:128/ + match value=\n@inbook{ type=stringignorecase offset=2:128/ + match value=\n@incollection{ type=stringignorecase offset=2:128/ + match value=\n@inproceedings{ type=stringignorecase offset=2:128/ + match value=\n@manual{ type=stringignorecase offset=2:128/ + match value=\n@misc{ type=stringignorecase offset=2:128/ + match value=\n@preamble{ type=stringignorecase offset=2:128/ + match 
value=\n@phdthesis{ type=stringignorecase offset=2:128/ + match value=\n@string{ type=stringignorecase offset=2:128/ + match value=\n@techreport{ type=stringignorecase offset=2:128/ + match value=\n@unpublished{ type=stringignorecase offset=2:128/ + /match /magic glob pattern=*.bib/ glob pattern=*.bibtex/ Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1683101r1=1683100r2=1683101view=diff == --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Tue Jun 2 13:15:21 2015 @@ -932,6 +932,12 @@ public class TestMimeTypes { } @Test +public void testTextFormats() throws Exception { +assertType(application/x-bibtex-text-file, testBIBTEX.bib); +assertTypeByData(application/x-bibtex-text-file, testBIBTEX.bib); +} + +@Test public void testCodeFormats() throws Exception { assertType(text/x-csrc, testC.c); Added: tika/trunk/tika-parsers/src/test
svn commit: r1683107 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java tika-parsers/src/test/resources/test-documents/testH.h
Author: nick Date: Tue Jun 2 13:33:55 2015 New Revision: 1683107 URL: http://svn.apache.org/r1683107 Log: TIKA-1634 Few more matlab and other code related tests Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testH.h Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1683107r1=1683106r2=1683107view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue Jun 2 13:33:55 2015 @@ -5580,6 +5580,9 @@ mime-type type=text/x-chdr _commentC source code header/_comment glob pattern=*.h/ +magic priority=30 + match value=#ifndef type=string offset=0/ +/magic sub-class-of type=text/plain/ /mime-type @@ -5598,6 +5601,9 @@ mime-type type=text/x-csrc alias type=text/x-c/ _commentC source code/_comment +magic priority=30 + match value=#include type=string offset=0/ +/magic glob pattern=*.c/ sub-class-of type=text/plain/ /mime-type Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1683107r1=1683106r2=1683107view=diff == --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Tue Jun 2 13:33:55 2015 @@ -940,10 +940,16 @@ public class TestMimeTypes { @Test public void testCodeFormats() throws Exception { assertType(text/x-csrc, testC.c); +assertType(text/x-chdr, testH.h); +assertTypeByData(text/x-csrc, testC.c); +assertTypeByData(text/x-chdr, 
testH.h); assertType(text/x-matlab, testMATLAB.m); assertType(text/x-matlab, testMATLAB_wtsgaus.m); assertType(text/x-matlab, testMATLAB_barcast.m); +assertTypeByData(text/x-matlab, testMATLAB.m); +assertTypeByData(text/x-matlab, testMATLAB_wtsgaus.m); +assertTypeByData(text/x-matlab, testMATLAB_barcast.m); } private void assertText(byte[] prefix) throws IOException { Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testH.h URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testH.h?rev=1683107view=auto == --- tika/trunk/tika-parsers/src/test/resources/test-documents/testH.h (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testH.h Tue Jun 2 13:33:55 2015 @@ -0,0 +1,5 @@ +#ifndef TIKA_HELLO_WORLD +#define TIKA_HELLO_WORLD + +#define HELLO world +#endif
svn commit: r1681337 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pkg/CompressorParser.java test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
Author: nick Date: Sat May 23 12:21:05 2015 New Revision: 1681337 URL: http://svn.apache.org/r1681337 Log: TIKA-1635 Disabled zlib parser support, not yet enabled pending a fix for a commons compress bug Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java?rev=1681337r1=1681336r2=1681337view=diff == --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java Sat May 23 12:21:05 2015 @@ -27,6 +27,7 @@ import org.apache.commons.compress.compr import org.apache.commons.compress.compressors.CompressorInputStream; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.compress.compressors.gzip.GzipUtils; import org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream; @@ -57,6 +58,8 @@ public class CompressorParser extends Ab private static final MediaType GZIP_ALT = MediaType.application(x-gzip); private static final MediaType XZ = MediaType.application(x-xz); private static final MediaType PACK = MediaType.application(application/x-java-pack200); +// TODO Not yet supported by CompressorStreamFactory, see COMPRESS-316 +private static final MediaType ZLIB = MediaType.application(zlib); private static final SetMediaType SUPPORTED_TYPES = MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, XZ, PACK); 
@@ -73,6 +76,8 @@ public class CompressorParser extends Ab return GZIP; } else if (stream instanceof XZCompressorInputStream) { return XZ; +} else if (stream instanceof DeflateCompressorInputStream) { +return ZLIB; } else if (stream instanceof Pack200CompressorInputStream) { return PACK; } else { @@ -133,6 +138,8 @@ public class CompressorParser extends Ab name = name.substring(0, name.length() - 4); } else if (name.endsWith(.xz)) { name = name.substring(0, name.length() - 3); +} else if (name.endsWith(.zlib)) { +name = name.substring(0, name.length() - 5); } else if (name.endsWith(.pack)) { name = name.substring(0, name.length() - 5); } else if (name.length() 0) { Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java?rev=1681337view=auto == --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java (added) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java Sat May 23 12:21:05 2015 @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pkg; + +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Ignore; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Test case for parsing zlib compressed + * + * Note - currently disabled, pending a fix for COMPRESS-316 + */ +public class ZlibParserTest extends AbstractPkgTest { +@Test
svn commit: r1681351 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Author: nick Date: Sat May 23 14:05:20 2015 New Revision: 1681351 URL: http://svn.apache.org/r1681351 Log: TIKA-1634 Two more kinds of matlab magic, and tests Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1681351r1=1681350r2=1681351view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Sat May 23 14:05:20 2015 @@ -5752,9 +5752,23 @@ mime-type type=text/x-matlab _commentMatlab source code/_comment +!-- Multiple-output function definition -- magic priority=50 match value=function [ type=string offset=0/ /magic +!-- Single-output or no output function definition -- +magic priority=40 + match value=function [a-zA-Z][A-Za-z0-9_]{0,5} type=regex offset=0/ +/magic +!-- Two matlab-style comments fairly early in the file -- +magic priority=25 + match value=% type=string offset=0 + match value=\n% type=string offset=2:120/ + /match + match value=% type=string offset=0 + match value=\r% type=string offset=2:120/ + /match +/magic !-- glob pattern=*.m/ - conflicts with text/x-objcsrc -- sub-class-of type=text/plain/ /mime-type Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1681351r1=1681350r2=1681351view=diff == --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Sat May 23 14:05:20 2015 @@ -930,6 +930,15 @@ public class TestMimeTypes { 
assertTypeByData(application/zlib, testTXT.zlib5); assertTypeByData(application/zlib, testTXT.zlib9); } + +@Test +public void testCodeFormats() throws Exception { +assertType(text/x-csrc, testC.c); + +assertType(text/x-matlab, testMATLAB.m); +assertType(text/x-matlab, testMATLAB_wtsgaus.m); +assertType(text/x-matlab, testMATLAB_barcast.m); +} private void assertText(byte[] prefix) throws IOException { assertMagic(text/plain, prefix);
svn commit: r1681348 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: nick Date: Sat May 23 13:43:27 2015 New Revision: 1681348 URL: http://svn.apache.org/r1681348 Log: Add an alternate zlib mimetype found in some places Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1681348r1=1681347r2=1681348view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Sat May 23 13:43:27 2015 @@ -4006,6 +4006,7 @@ /mime-type mime-type type=application/zlib +alias type=application/x-deflate/ _commentZLIB Compressed Data Format/_comment tika:linkhttp://tools.ietf.org/html/rfc1950/tika:link magic priority=45
svn commit: r1681349 - in /tika/trunk/tika-parsers/src/test/resources/test-documents: testMATLAB.m testMATLAB_barcast.m testMATLAB_wtsgaus.m
Author: nick Date: Sat May 23 13:51:55 2015 New Revision: 1681349 URL: http://svn.apache.org/r1681349 Log: TIKA-1634 Add some sample matlab files Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB.m tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_barcast.m tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_wtsgaus.m Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB.m URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB.m?rev=1681349view=auto == --- tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB.m (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB.m Sat May 23 13:51:55 2015 @@ -0,0 +1,4 @@ +function helloworld +fprintf('Hello, World!\n') +disp('Hello, World!'); +end Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_barcast.m URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_barcast.m?rev=1681349view=auto == --- tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_barcast.m (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_barcast.m Sat May 23 13:51:55 2015 @@ -0,0 +1,383 @@ +%% CONTROL CODE FOR FULLY BAYESIAN SPATIO-TEMPORAL TEMPERATURE RECONSTRUCTION +%EVERYTHING IS MODULAR TO ALLOW FOR EASY DEBUGGING AND ADAPTATION +% _vNewModel_Oct08: change the formalism to reflect new model (Beta_1 now +% normal). Allows for multiple proxies +clear all; close all; +%SET MATLAB'S CURRENT DIRECTORY TO HERE. 
+% set the priors and the inital values for the MCMC sampler +Prior_pars_vNewModel +Initial_par_vals_vNewModel +%% Set the seed of the random number generators +randn('state', sum((1000+600)*clock)) +rand('state', sum((1000+800)*clock)) + +%% load the data +cd TestData +load BARCAST_INPUT_vNewMeth1 +%break it apart +Locs=BARCAST_INPUT.Master_Locs; +N_Locs=length(Locs(:,1)); %Number of locations: +timeline=[BARCAST_INPUT.Data_timeline(1)-1, BARCAST_INPUT.Data_timeline]; +N_Times=length(timeline)-1; %Number of DATA times +loc_areas=BARCAST_INPUT.Areas; +Inds_GridLocs_Central=BARCAST_INPUT.Inds_Central; + +%get the number of proxy types: +N_PT=length(fieldnames(BARCAST_INPUT))-5; + +%stack the three data matrices, one on top of the other +%the first N_Locs ROWS are the Inst, the next N_Locs ROWS the first proxy +%type, the next the third. . . . .. Each column a year. The first +%corresponds to the SECOND entry in timeline. +Data_ALL=BARCAST_INPUT.Inst_Data; +for kk=1:1:N_PT +tp=eval(['BARCAST_INPUT.Prox_Data', num2str(kk)]); +Data_ALL=[Data_ALL; tp]; +end + +% % % % All_locs_wInd=BARCAST_INPUT.All_locs_wInd; +% % % % lon_lat_area=BARCAST_INPUT.lon_lat_area; +% % % % DATA_Mat=BARCAST_INPUT.DATA_Mat; +% % % % DATA_Mat_locs=BARCAST_INPUT.DATA_Mat_locs; +% % % % Inds_GridLocs_Central=BARCAST_INPUT.Inds_GridLocs_Central; +% % % % timeline=BARCAST_INPUT.timeline; +% % % % clear BARCAST_INPUT + +%Priors and MH jumping parameters, from Prior_pars_vNewModel +load PRIORS_vNewMeth1 +load MHpars_vNewMeth1 +%Initial values from Initial_par_vals_vNewModel +load INITIAL_VALS_vNewMeth1 + +%The Order of THE SCALAR parameters WILL ALWAYS thus: +%1 = alpha, the AR(1) coefficient +%2 = mu, the constant par in the linear mean of the AR(1) process +%3 = sigma2, the partial sill in the spatial covariance matrix +%4 = phi, the range parameter in the spatial covariance matrix +%5 = tau2_I, the Inst measurement error +%6 = tau2_P, the measurement error, first PROX type +%7 = Beta_1, the 
scaling par in the first P observation equation +%8 = Beta_0, the additive par in the first P observation equation +%and, if there is second proxy type +%9 = tau2_P_2, the measurement error, second PROX type +%10 = Beta_1, the scaling par in the second P observation equation +%11 = Beta_0, the additive par in the second P observation equation +%and, if there is third proxy type . . . . + +%A NOTE ON GAMMA NOTATION. WE USE THE NOTATION OF Gelman et al, Bayesian +%Data Analysis, WHERE GAMMA PARAMETERS ALPHA, BETA)==(SHAPE, INVERSE SCALE). +%THE RANDRAW.M CODE USES (A,B)==(SHAPE, SCALE), AND THE CALL IS RANDRAW('GAMMA', [M,B,A], SAMPLESIZE), +%WHERE M IS THE LOCATION (NOT NEEDED). SO IN THE NOTATION OF GELMAN ET AT, THE CALL IS +%RANDRAW('GAMMA', [0,1/BETA,ALPHA], SAMPLESIZE). +%For example, +%RANDRAW('GAMMA', [0,1/PRIORS.sigma2(2),PRIORS.sigma2(1)], 1), AND ETC. + +%switch back tot he main directory +cd .. +%% SET a few parameters +%Number of iterations of the complete sampler +Sampler_Its=2000; + +%Number of times to update only the temperature array before beginning to +%update the other parameters +pre_Sampler_Its=500; + + +%% Areal weights vector
svn commit: r1680957 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/test/java/org/apache/tika/mime/ tika-parsers/src/test/java/org/apache/tika/parser/pdf/ tika-p
Author: nick Date: Thu May 21 21:49:11 2015 New Revision: 1680957 URL: http://svn.apache.org/r1680957 Log: TIKA-1085 Treat a PDF with a leading Byte Order Mark the same for detection, and add low-priority matches for the PDF magic coming in 1-1024 bytes of the start (may give false positives if too high), plus tests Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf (with props) Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1680957r1=1680956r2=1680957view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu May 21 21:49:11 2015 @@ -481,7 +481,15 @@ tika:linkhttp://www.adobe.com/devnet/pdf/pdf_reference_archive.html/tika:link tika:uticom.adobe.pdf/tika:uti magic priority=50 + !-- Normally just %PDF- -- match value=%PDF- type=string offset=0/ + !-- Sometimes has a UTF-8 Byte Order Mark first -- + match value=\xef\xbb\xbf%PDF- type=string offset=0/ +/magic +magic priority=20 + !-- Low priority match for %PDF near the start of the file -- + !-- Can trigger false positives, so set the priority rather low here -- + match value=%PDF- type=string offset=1:512/ /magic glob pattern=*.pdf/ /mime-type Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1680957r1=1680956r2=1680957view=diff == --- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Thu May 21 21:49:11 2015 @@ -501,10 +501,17 @@ public class TestMimeTypes { @Test public void testPdfDetection() throws Exception { -assertType(application/pdf, testPDF.pdf); -assertTypeByData(application/pdf, testPDF.pdf); +// PDF extension by name is enough assertTypeByName(application/pdf, x.pdf); assertTypeByName(application/pdf, x.PDF); + +// For normal PDFs, can get by name or data or both +assertType(application/pdf, testPDF.pdf); +assertTypeByData(application/pdf, testPDF.pdf); + +// PDF with a BoM works both ways too +assertType(application/pdf, testPDF_bom.pdf); +assertTypeByData(application/pdf, testPDF_bom.pdf); } @Test Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1680957r1=1680956r2=1680957view=diff == --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Thu May 21 21:49:11 2015 @@ -652,6 +652,8 @@ public class PDFParserTest extends TikaT knownMetadataDiffs.add(testAnnotations.pdf); // Added for TIKA-93. knownMetadataDiffs.add(testOCR.pdf); +// Added for TIKA-1085 +knownMetadataDiffs.add(testPDF_bom.pdf); //empty for now SetString knownContentDiffs = new HashSetString(); Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf?rev=1680957view=auto == Binary file - no diff available. Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf -- svn:mime-type = application/octet-stream
svn commit: r1680959 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Author: nick Date: Thu May 21 22:13:08 2015 New Revision: 1680959 URL: http://svn.apache.org/r1680959 Log: TIKA-1632 zlib mime magic from Pavel Micka Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1680959r1=1680958r2=1680959view=diff == --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu May 21 22:13:08 2015 @@ -4005,6 +4005,19 @@ glob pattern=*.zip/ /mime-type + mime-type type=application/zlib +_commentZLIB Compressed Data Format/_comment +tika:linkhttp://tools.ietf.org/html/rfc1950/tika:link +magic priority=45 + !-- Low compression -- + match value=\x78\x01 type=string offset=0 / + !-- Default compression -- + match value=\x78\x9c type=string offset=0 / + !-- Best compression -- + match value=\x78\xda type=string offset=0 / +/magic + /mime-type + mime-type type=application/x-7z-compressed acronym7zip/acronym _comment7-zip archive/_comment
svn commit: r1680358 - in /tika/site: publish/1.7/formats.html publish/1.8/formats.html publish/1.9/formats.html src/site/apt/1.7/formats.apt src/site/apt/1.8/formats.apt src/site/apt/1.9/formats.apt
Author: nick Date: Tue May 19 17:57:53 2015 New Revision: 1680358 URL: http://svn.apache.org/r1680358 Log: Update the formats to make it clearer that these are the parser-supported formats, and more get detection-only Modified: tika/site/publish/1.7/formats.html tika/site/publish/1.8/formats.html tika/site/publish/1.9/formats.html tika/site/src/site/apt/1.7/formats.apt tika/site/src/site/apt/1.8/formats.apt tika/site/src/site/apt/1.9/formats.apt Modified: tika/site/publish/1.7/formats.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.7/formats.html?rev=1680358r1=1680357r2=1680358view=diff == --- tika/site/publish/1.7/formats.html (original) +++ tika/site/publish/1.7/formats.html Tue May 19 17:57:53 2015 @@ -86,7 +86,8 @@ div id=content !-- Licensed to the Apache Software Foundation (ASF) under one or more --!-- contributor license agreements. See the NOTICE file distributed with --!-- this work for additional information regarding copyright ownership. --!-- The ASF licenses this file to You under the Apache License, Version 2.0 --!-- (the License); you may not use this file except in compliance with --!-- the License. You may obtain a copy of the License at --!-- --!-- http://www.apache.org/licenses/LICENSE-2.0 --!-- --!-- Unless required by applicable law or agreed to in writing, software --!-- distributed under the License is distributed on an AS IS BASIS, --!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --!-- See the License for the specific language governing permissions and --!-- limitations under the License. --div class=section h2Supported Document Formatsa name=Supported_Document_Formats/a/h2 -pThis page lists all the document formats supported by Apache Tika 1.7. Follow the links to the various parser class javadocs for more detailed information about each document format and how it is parsed by Tika./p +pThis page lists all the document formats supported by the parsers in Apache Tika 1.7. 
Follow the links to the various parser class javadocs for more detailed information about each document format and how it is parsed by Tika./p +p(Please note that Apache Tika is able to detect a much wider range of formats than those listed below, this page only documents those formats from which Tika is able to extract metadata and/or textual content)/p ul lia href=#Supported_Document_FormatsSupported Document Formats/a ul Modified: tika/site/publish/1.8/formats.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.8/formats.html?rev=1680358r1=1680357r2=1680358view=diff == --- tika/site/publish/1.8/formats.html (original) +++ tika/site/publish/1.8/formats.html Tue May 19 17:57:53 2015 @@ -86,7 +86,8 @@ div id=content !-- Licensed to the Apache Software Foundation (ASF) under one or more --!-- contributor license agreements. See the NOTICE file distributed with --!-- this work for additional information regarding copyright ownership. --!-- The ASF licenses this file to You under the Apache License, Version 2.0 --!-- (the License); you may not use this file except in compliance with --!-- the License. You may obtain a copy of the License at --!-- --!-- http://www.apache.org/licenses/LICENSE-2.0 --!-- --!-- Unless required by applicable law or agreed to in writing, software --!-- distributed under the License is distributed on an AS IS BASIS, --!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --!-- See the License for the specific language governing permissions and --!-- limitations under the License. --div class=section h2Supported Document Formatsa name=Supported_Document_Formats/a/h2 -pThis page lists all the document formats supported by Apache Tika 1.8. Follow the links to the various parser class javadocs for more detailed information about each document format and how it is parsed by Tika./p +pThis page lists all the document formats supported by the parsers in Apache Tika 1.8. 
Follow the links to the various parser class javadocs for more detailed information about each document format and how it is parsed by Tika./p +p(Please note that Apache Tika is able to detect a much wider range of formats than those listed below, this page only documents those formats from which Tika is able to extract metadata and/or textual content)/p ul lia href=#Supported_Document_FormatsSupported Document Formats/a ul Modified: tika/site/publish/1.9/formats.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.9/formats.html?rev=1680358r1=1680357r2=1680358view=diff == --- tika/site/publish/1.9/formats.html (original) +++ tika/site/publish/1.9/formats.html Tue May 19 17:57:53 2015 @@ -86,7 +86,8 @@ div id