[4/4] tika git commit: PKCS7 signature detection tests, using test files from TIKA-1821

2016-02-03 Thread nick
PKCS7 signature detection tests, using test files from TIKA-1821


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/046e43f8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/046e43f8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/046e43f8

Branch: refs/heads/master
Commit: 046e43f81c37f0ab92d9420fab8b645969d5a13c
Parents: 57ae2c5
Author: Nick Burch <n...@gagravarr.org>
Authored: Wed Feb 3 14:13:30 2016 +0000
Committer: Nick Burch <n...@gagravarr.org>
Committed: Wed Feb 3 14:13:30 2016 +0000

--
 .../test/java/org/apache/tika/mime/TestMimeTypes.java  | 13 +
 1 file changed, 13 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/046e43f8/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
--
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 2f9193d..77d25df 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -979,6 +979,19 @@ public class TestMimeTypes {
 assertTypeByData("text/vtt", "testWebVTT.vtt");
 }
 
+@Test
+public void testPKCSSignatures() throws Exception {
+// PKCS7 Signed XML files
+assertType("application/pkcs7-signature", "testPKCS17Sig.xml.p7m");
+assertType("application/pkcs7-signature", "testPKCS17Sig-v2.xml.p7m");
+assertType("application/pkcs7-signature", "testPKCS17Sig-v3.xml.p7m");
+assertType("application/pkcs7-signature", "testPKCS17Sig-v4.xml.p7m");
+assertTypeByData("application/pkcs7-signature", "testPKCS17Sig.xml.p7m");
+assertTypeByData("application/pkcs7-signature", "testPKCS17Sig-v2.xml.p7m");
+assertTypeByData("application/pkcs7-signature", "testPKCS17Sig-v3.xml.p7m");
+assertTypeByData("application/pkcs7-signature", "testPKCS17Sig-v4.xml.p7m");
+}
+
 private void assertText(byte[] prefix) throws IOException {
 assertMagic("text/plain", prefix);
 }



[3/4] tika git commit: Unit test for detecting JS files

2016-02-03 Thread nick
Unit test for detecting JS files

As we don't currently have any JS file magic, we can't detect
as such without the file name. However, with the filename, ensure
we do get it right, even if there's HTML snippet in the JS. TIKA-1141


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/557b3704
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/557b3704
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/557b3704

Branch: refs/heads/master
Commit: 557b3704501a9692809a3e1b7838866786ed3366
Parents: d740f5d
Author: Nick Burch <n...@gagravarr.org>
Authored: Wed Feb 3 17:20:55 2016 +0000
Committer: Nick Burch <n...@gagravarr.org>
Committed: Wed Feb 3 17:25:52 2016 +0000

--
 .../test/java/org/apache/tika/mime/TestMimeTypes.java   | 12 
 1 file changed, 12 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/557b3704/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
--
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 77d25df..92f7b88 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -971,6 +971,18 @@ public class TestMimeTypes {
 assertTypeByData("text/x-matlab", "testMATLAB.m");
 assertTypeByData("text/x-matlab", "testMATLAB_wtsgaus.m");
 assertTypeByData("text/x-matlab", "testMATLAB_barcast.m");
+
+// By name, or by name+data, gets it as JS
+assertTypeByName("application/javascript", "testJS.js");
+assertTypeByName("application/javascript", "testJS_HTML.js");
+assertType("application/javascript", "testJS.js");
+assertType("application/javascript", "testJS_HTML.js");
+
+// With data only, because we have no JS file magic, can't be
+//  detected. One will come through as plain text, the other
+//  as HTML due to <html> in it. TODO Add JS magic. See TIKA-1141
+//assertTypeByData("application/javascript", "testJS.js");
+//assertTypeByData("application/javascript", "testJS_HTML.js");
 }
 
 @Test



[2/4] tika git commit: Lower the priority of

2016-02-03 Thread nick
Lower the priority of http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d740f5d8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d740f5d8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d740f5d8

Branch: refs/heads/master
Commit: d740f5d8b2e42b1db42806ddd395e034cb416fd4
Parents: d8a2fc0
Author: Nick Burch <n...@gagravarr.org>
Authored: Wed Feb 3 17:11:06 2016 +0000
Committer: Nick Burch <n...@gagravarr.org>
Committed: Wed Feb 3 17:11:06 2016 +0000

--
 .../org/apache/tika/mime/tika-mimetypes.xml| 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/d740f5d8/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
--
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 1d7b42b..95f41e6 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5432,12 +5432,6 @@
   
   
   
-  
-  
   
   
   
@@ -5449,6 +5443,17 @@
   
   
   
+  
+
+
+  
+  
+  
 
 
 



[4/4] tika git commit: Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/tika

2016-02-03 Thread nick
Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/tika


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6c0b7906
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6c0b7906
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6c0b7906

Branch: refs/heads/master
Commit: 6c0b7906ecbc22ea9adb4c1e5781b0eff561957d
Parents: 557b370 1e0159b
Author: Nick Burch <n...@gagravarr.org>
Authored: Wed Feb 3 17:26:09 2016 +0000
Committer: Nick Burch <n...@gagravarr.org>
Committed: Wed Feb 3 17:26:09 2016 +0000

--
 .../tika/parser/rtf/RTFEmbObjHandler.java   |   2 +-
 .../apache/tika/parser/rtf/RTFParserTest.java   |   9 +-
 .../tika/server/resource/TikaResource.java  |  14 +-
 .../apache/tika/server/TikaResourceTest.java|  12 +
 .../testRTF_npeFromWMFInTikaServer.rtf  | 235 +++
 5 files changed, 262 insertions(+), 10 deletions(-)
--




[1/4] tika git commit: Test JS file that includes

2016-02-03 Thread nick
Repository: tika
Updated Branches:
  refs/heads/master 1e0159b73 -> 6c0b7906e


Test JS file that includes http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d8a2fc01
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d8a2fc01
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d8a2fc01

Branch: refs/heads/master
Commit: d8a2fc01b4da5ffb7be19864512401c54aa04bfd
Parents: 046e43f
Author: Nick Burch <n...@gagravarr.org>
Authored: Wed Feb 3 17:10:33 2016 +0000
Committer: Nick Burch <n...@gagravarr.org>
Committed: Wed Feb 3 17:10:33 2016 +0000

--
 .../resources/test-documents/testJS_HTML.js | 91 
 1 file changed, 91 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/d8a2fc01/tika-parsers/src/test/resources/test-documents/testJS_HTML.js
--
diff --git a/tika-parsers/src/test/resources/test-documents/testJS_HTML.js b/tika-parsers/src/test/resources/test-documents/testJS_HTML.js
new file mode 100644
index 0000000..a362198
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testJS_HTML.js
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+var places = new Array();
+
+places[0] = {
+   'name': 'Oxford', lat: 51.75222, lng: -1.25596,
+   'id': 'map_1',
+}
+places[1] = {
+   'name': 'Oxford', lat: 41.43399, lng: -73.11678,
+   'id': 'map_2',
+}
+places[2] = {
+   'name': 'Oxford', lat: -43.3, lng: 172.18333,
+   'id': 'map_3',
+}
+places[3] = {
+   'name': 'Oxford', lat: 33.619, lng: -83.86741,
+   'id': 'map_4',
+}
+places[4] = {
+   'name': 'Oxford', lat: 44.13174, lng: -70.49311,
+   'id': 'map_5',
+}
+places[5] = {
+   'name': 'Oxford', lat: 39.78539, lng: -75.97883,
+   'id': 'map_6',
+}
+places[6] = {
+   'name': 'Oxford', lat: 40.51976, lng: -87.24779,
+   'id': 'map_7',
+}
+places[7] = {
+   'name': 'Oxford', lat: 45.73345, lng: -63.86542,
+   'id': 'map_8',
+}
+places[8] = {
+   'name': 'Oxford', lat: 42.44202, lng: -75.59769,
+   'id': 'map_9',
+}
+places[9] = {
+   'name': 'Oxford', lat: 40.80315, lng: -74.98962,
+   'id': 'map_10',
+}
+
+function drawMaps() {
+   if (GBrowserIsCompatible()) {
+  for(var i in places) {
+ var p = places[i];
+ var div = document.getElementById(p['id']);
+
+ div.style.display = "block";
+ div.parentNode.style.marginBottom = "35px";
+
+ var map = new GMap2(div);
+ map.setCenter(new GLatLng(p['lat'], p['lng']), 8);
+
+ var m = new GMarker( 
+new GLatLng(p['lat'], p['lng']),
+{title: p['name']}
+ );
+ map.addOverlay(m);
+  }
+   } else {
+  document.write("Unsupported 
Browser");
+   }
+}
+
+var t;
+$(document).ready(function(){
+  t = setTimeout(function() {
+ clearTimeout(t);
+ drawMaps();
+  }, 15*1000);
+});



[1/2] tika git commit: TIKA-1823 Sample AutoCAD 2010 DWF file

2016-01-26 Thread nick
Repository: tika
Updated Branches:
  refs/heads/master 5c0ef63e4 -> 6a0923326


TIKA-1823 Sample AutoCAD 2010 DWF file

AutoCAD supplied sample file for AutoCAD 2010 DWF, from
https://knowledge.autodesk.com/support/autocad/downloads/caas/downloads/content/autocad-sample-files.html


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/38fbc504
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/38fbc504
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/38fbc504

Branch: refs/heads/master
Commit: 38fbc504944732f6aefddc3ce7e802a5103b6f89
Parents: 5c0ef63
Author: Nick Burch <n...@gagravarr.org>
Authored: Tue Jan 26 16:26:55 2016 +0000
Committer: Nick Burch <n...@gagravarr.org>
Committed: Tue Jan 26 16:26:55 2016 +0000

--
 .../test/resources/test-documents/testDWF2010.dwf | Bin 0 -> 101370 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/38fbc504/tika-parsers/src/test/resources/test-documents/testDWF2010.dwf
--
diff --git a/tika-parsers/src/test/resources/test-documents/testDWF2010.dwf b/tika-parsers/src/test/resources/test-documents/testDWF2010.dwf
new file mode 100644
index 0000000..f72f4e6
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testDWF2010.dwf differ



[2/2] tika git commit: TIKA-1823 AutoCAD DWF mime magic and subtypes

2016-01-26 Thread nick
TIKA-1823 AutoCAD DWF mime magic and subtypes

Parent AutoCAD DWF mimetype and general magic, based on patch from
Luca Moretti, along with version-specific subtypes with more specific
magic


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6a092332
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6a092332
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6a092332

Branch: refs/heads/master
Commit: 6a092332686e02ba26456e52fb0ce8bc5b42be56
Parents: 38fbc50
Author: Nick Burch <n...@gagravarr.org>
Authored: Tue Jan 26 16:31:17 2016 +0000
Committer: Nick Burch <n...@gagravarr.org>
Committed: Tue Jan 26 16:31:17 2016 +0000

--
 .../org/apache/tika/mime/tika-mimetypes.xml | 30 
 .../org/apache/tika/mime/TestMimeTypes.java |  5 
 2 files changed, 35 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/6a092332/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
--
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 5d152a5..1d7b42b 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5250,8 +5250,38 @@
   
 
   
+DWF
+<_comment>AutoCAD Design Web Format
+
+
+  
+ 
+
+ 
+  
+
 
   
+  
+
+
+  
+ 
+  
+
+
+  
+  
+
+  
+
+  
+  
+
+  
+
+  
+
   
 <_comment>AutoCAD Design Web Format
 

http://git-wip-us.apache.org/repos/asf/tika/blob/6a092332/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
--
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 28aae81..2f9193d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -538,6 +538,11 @@ public class TestMimeTypes {
 assertTypeByData("image/vnd.dwg", "testDWG2010.dwg");
 
 // From name, gets the common parent type
+assertTypeByName("model/vnd.dwf", "x.dwf");
+// With the data, can work out it's the v6 zip-based flavour
+assertTypeByData("model/vnd.dwf; version=6", "testDWF2010.dwf");
+
+// From name, gets the common parent type
 assertTypeByName("image/vnd.dxf", "x.dxf");
 // With the data, can work out it's the ASCII flavour
 assertTypeByData("image/vnd.dxf; format=ascii", "testDXF_ascii.dxf");



svn commit: r1726164 - in /tika/site/src/site/apt: 0.10/ 0.7/ 0.8/ 0.9/ 1.0/ 1.1/ 1.10/ 1.11/ 1.2/ 1.3/ 1.4/ 1.5/ 1.6/ 1.7/ 1.8/ 1.9/

2016-01-22 Thread nick
Author: nick
Date: Fri Jan 22 09:37:17 2016
New Revision: 1726164

URL: http://svn.apache.org/viewvc?rev=1726164&view=rev
Log:
Remove odd extra header line

Modified:
tika/site/src/site/apt/0.10/parser_guide.apt
tika/site/src/site/apt/0.7/parser_guide.apt
tika/site/src/site/apt/0.8/parser_guide.apt
tika/site/src/site/apt/0.9/parser_guide.apt
tika/site/src/site/apt/1.0/parser_guide.apt
tika/site/src/site/apt/1.1/parser_guide.apt
tika/site/src/site/apt/1.10/parser_guide.apt
tika/site/src/site/apt/1.11/parser_guide.apt
tika/site/src/site/apt/1.2/parser_guide.apt
tika/site/src/site/apt/1.3/parser_guide.apt
tika/site/src/site/apt/1.4/parser_guide.apt
tika/site/src/site/apt/1.5/parser_guide.apt
tika/site/src/site/apt/1.6/parser_guide.apt
tika/site/src/site/apt/1.7/parser_guide.apt
tika/site/src/site/apt/1.8/parser_guide.apt
tika/site/src/site/apt/1.9/parser_guide.apt

Modified: tika/site/src/site/apt/0.10/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.10/parser_guide.apt?rev=1726164&r1=1726163&r2=1726164&view=diff
==
--- tika/site/src/site/apt/0.10/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.10/parser_guide.apt Fri Jan 22 09:37:17 2016
@@ -1,8 +1,6 @@

Get Tika parsing up and running in 5 minutes

-  Arturo Beltran
-  

 
 ~~ Licensed to the Apache Software Foundation (ASF) under one or more
 ~~ contributor license agreements.  See the NOTICE file distributed with

Modified: tika/site/src/site/apt/0.7/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.7/parser_guide.apt?rev=1726164&r1=1726163&r2=1726164&view=diff
==
--- tika/site/src/site/apt/0.7/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.7/parser_guide.apt Fri Jan 22 09:37:17 2016
@@ -1,8 +1,6 @@

Get Tika parsing up and running in 5 minutes

-  Arturo Beltran
-  

 
 ~~ Licensed to the Apache Software Foundation (ASF) under one or more
 ~~ contributor license agreements.  See the NOTICE file distributed with

Modified: tika/site/src/site/apt/0.8/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.8/parser_guide.apt?rev=1726164&r1=1726163&r2=1726164&view=diff
==
--- tika/site/src/site/apt/0.8/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.8/parser_guide.apt Fri Jan 22 09:37:17 2016
@@ -1,8 +1,6 @@

Get Tika parsing up and running in 5 minutes

-  Arturo Beltran
-  

 
 ~~ Licensed to the Apache Software Foundation (ASF) under one or more
 ~~ contributor license agreements.  See the NOTICE file distributed with

Modified: tika/site/src/site/apt/0.9/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.9/parser_guide.apt?rev=1726164&r1=1726163&r2=1726164&view=diff
==
--- tika/site/src/site/apt/0.9/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.9/parser_guide.apt Fri Jan 22 09:37:17 2016
@@ -1,8 +1,6 @@

Get Tika parsing up and running in 5 minutes

-  Arturo Beltran
-  

 
 ~~ Licensed to the Apache Software Foundation (ASF) under one or more
 ~~ contributor license agreements.  See the NOTICE file distributed with

Modified: tika/site/src/site/apt/1.0/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/1.0/parser_guide.apt?rev=1726164&r1=1726163&r2=1726164&view=diff
==
--- tika/site/src/site/apt/1.0/parser_guide.apt (original)
+++ tika/site/src/site/apt/1.0/parser_guide.apt Fri Jan 22 09:37:17 2016
@@ -1,8 +1,6

svn commit: r1726166 - in /tika/site/publish: ./ 0.10/ 0.7/ 0.8/ 0.9/ 1.0/ 1.1/ 1.10/ 1.11/ 1.2/ 1.3/ 1.4/ 1.5/ 1.6/ 1.7/ 1.8/ 1.9/

2016-01-22 Thread nick
Author: nick
Date: Fri Jan 22 09:45:05 2016
New Revision: 1726166

URL: http://svn.apache.org/viewvc?rev=1726166&view=rev
Log:
Republish the site for Git updates

Modified:
tika/site/publish/0.10/parser_guide.html
tika/site/publish/0.7/parser_guide.html
tika/site/publish/0.8/parser_guide.html
tika/site/publish/0.9/parser_guide.html
tika/site/publish/1.0/parser_guide.html
tika/site/publish/1.1/parser_guide.html
tika/site/publish/1.10/parser_guide.html
tika/site/publish/1.11/parser_guide.html
tika/site/publish/1.2/parser_guide.html
tika/site/publish/1.3/parser_guide.html
tika/site/publish/1.4/parser_guide.html
tika/site/publish/1.5/parser_guide.html
tika/site/publish/1.6/parser_guide.html
tika/site/publish/1.7/parser_guide.html
tika/site/publish/1.8/parser_guide.html
tika/site/publish/1.9/parser_guide.html
tika/site/publish/contribute.html

Modified: tika/site/publish/0.10/parser_guide.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/0.10/parser_guide.html?rev=1726166&r1=1726165&r2=1726166&view=diff
==
--- tika/site/publish/0.10/parser_guide.html (original)
+++ tika/site/publish/0.10/parser_guide.html Fri Jan 22 09:45:05 2016
@@ -99,7 +99,7 @@
 The Getting Started document describes 
how to build Apache Tika from sources and how to start using Tika in an 
application. Pay close attention and follow the instructions in the 
Getting and building the sources section.
 
 Add your MIME-Type
-You first need to modify http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;>tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 in order to Tika can map the file extension with its MIME-Type. You should add 
something like this:
+You first need to modify https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master;>tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 in order to Tika can map the file extension with its MIME-Type. You should add 
something like this:
 
  mime-type type=application/hello
 glob pattern=*.hi/
@@ -178,7 +178,7 @@ public class HelloParser implements Pars
 
 List the new parser
 Finally, you should explicitly tell the AutoDetectParser to include your 
new parser. This step is only needed if you want to use the AutoDetectParser 
functionality. If you figure out the correct parser in a different way, it 
isn't needed. 
-List your new parser in: http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;>tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+List your new parser in: https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master;>tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
   
   
 
@@ -352,7 +352,7 @@ public class HelloParser implements Pars
   
   
 
-  Copyright  2015
+  Copyright  2016
   http://www.apache.org/;>The Apache Software Foundation.
   Site powered by http://maven.apache.org/;>Apache Maven. 
   Search powered by

Modified: tika/site/publish/0.7/parser_guide.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/0.7/parser_guide.html?rev=1726166&r1=1726165&r2=1726166&view=diff
==
--- tika/site/publish/0.7/parser_guide.html (original)
+++ tika/site/publish/0.7/parser_guide.html Fri Jan 22 09:45:05 2016
@@ -99,7 +99,7 @@
 The Getting Started document describes 
how to build Apache Tika from sources and how to start using Tika in an 
application. Pay close attention and follow the instructions in the 
Getting and building the sources section.
 
 Add your MIME-Type
-You first need to modify http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;>tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 in order to Tika can map the file extension with its MIME-Type. You should add 
something like this:
+You first need to modify https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master;>tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 in order to Tika can map the file extension with its MIME-Type. You should add 
something like this:
 
  mime-type type=application/hello
 glob pattern=*.hi/
@@ -178,7 +178,7 @@ public class HelloParser implements Pars
 
 List the new parser
 Finally, you should explicitly tell the AutoDetectParser to include your 
new parser. This step is only n

svn commit: r1726163 - in /tika/site/src/site/apt: 0.10/ 0.7/ 0.8/ 0.9/ 1.0/ 1.1/ 1.10/ 1.11/ 1.2/ 1.3/ 1.4/ 1.5/ 1.6/ 1.7/ 1.8/ 1.9/

2016-01-22 Thread nick
Author: nick
Date: Fri Jan 22 09:35:27 2016
New Revision: 1726163

URL: http://svn.apache.org/viewvc?rev=1726163&view=rev
Log:
Change SVN view URL to a Git one

Modified:
tika/site/src/site/apt/0.10/parser_guide.apt
tika/site/src/site/apt/0.7/parser_guide.apt
tika/site/src/site/apt/0.8/parser_guide.apt
tika/site/src/site/apt/0.9/parser_guide.apt
tika/site/src/site/apt/1.0/parser_guide.apt
tika/site/src/site/apt/1.1/parser_guide.apt
tika/site/src/site/apt/1.10/parser_guide.apt
tika/site/src/site/apt/1.11/parser_guide.apt
tika/site/src/site/apt/1.2/parser_guide.apt
tika/site/src/site/apt/1.3/parser_guide.apt
tika/site/src/site/apt/1.4/parser_guide.apt
tika/site/src/site/apt/1.5/parser_guide.apt
tika/site/src/site/apt/1.6/parser_guide.apt
tika/site/src/site/apt/1.7/parser_guide.apt
tika/site/src/site/apt/1.8/parser_guide.apt
tika/site/src/site/apt/1.9/parser_guide.apt

Modified: tika/site/src/site/apt/0.10/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.10/parser_guide.apt?rev=1726163&r1=1726162&r2=1726163&view=diff
==
--- tika/site/src/site/apt/0.10/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.10/parser_guide.apt Fri Jan 22 09:35:27 2016
@@ -35,7 +35,7 @@ Get Tika parsing up and running in 5 min
 
 * {Add your MIME-Type}
 
-   You first need to modify 
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}}
+   You first need to modify 
{{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}}
in order to Tika can map the file extension with its MIME-Type. You should 
add something like this:

 ---

Modified: tika/site/src/site/apt/0.7/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.7/parser_guide.apt?rev=1726163&r1=1726162&r2=1726163&view=diff
==
--- tika/site/src/site/apt/0.7/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.7/parser_guide.apt Fri Jan 22 09:35:27 2016
@@ -35,7 +35,7 @@ Get Tika parsing up and running in 5 min
 
 * {Add your MIME-Type}
 
-   You first need to modify 
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}}
+   You first need to modify 
{{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}}
in order to Tika can map the file extension with its MIME-Type. You should 
add something like this:

 ---

Modified: tika/site/src/site/apt/0.8/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.8/parser_guide.apt?rev=1726163&r1=1726162&r2=1726163&view=diff
==
--- tika/site/src/site/apt/0.8/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.8/parser_guide.apt Fri Jan 22 09:35:27 2016
@@ -35,7 +35,7 @@ Get Tika parsing up and running in 5 min
 
 * {Add your MIME-Type}
 
-   You first need to modify 
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}}
+   You first need to modify 
{{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}}
in order to Tika can map the file extension with its MIME-Type. You should 
add something like this:

 ---

Modified: tika/site/src/site/apt/0.9/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.9/parser_guide.apt?rev=1726163&r1=1726162&r2=1726163&view=diff
==
--- tika/site/src/site/apt/0.9/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.9/parser_guide.apt Fri Jan 22 09:35:27 2016
@@ -35,7 +35,7 @@ Get Tika parsing up and running in 5 min
 
 * {Add your MIME-Type}
 
-   You first need to modify 
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}}
+   You first need to modify 
{{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache

svn commit: r1726165 - in /tika/site/src/site/apt: 0.10/ 0.7/ 0.8/ 0.9/ 1.0/ 1.1/ 1.10/ 1.11/ 1.2/ 1.3/ 1.4/ 1.5/ 1.6/ 1.7/ 1.8/ 1.9/

2016-01-22 Thread nick
Author: nick
Date: Fri Jan 22 09:40:16 2016
New Revision: 1726165

URL: http://svn.apache.org/viewvc?rev=1726165&view=rev
Log:
Update another SVN view link to a Git view link

Modified:
tika/site/src/site/apt/0.10/parser_guide.apt
tika/site/src/site/apt/0.7/parser_guide.apt
tika/site/src/site/apt/0.8/parser_guide.apt
tika/site/src/site/apt/0.9/parser_guide.apt
tika/site/src/site/apt/1.0/parser_guide.apt
tika/site/src/site/apt/1.1/parser_guide.apt
tika/site/src/site/apt/1.10/parser_guide.apt
tika/site/src/site/apt/1.11/parser_guide.apt
tika/site/src/site/apt/1.2/parser_guide.apt
tika/site/src/site/apt/1.3/parser_guide.apt
tika/site/src/site/apt/1.4/parser_guide.apt
tika/site/src/site/apt/1.5/parser_guide.apt
tika/site/src/site/apt/1.6/parser_guide.apt
tika/site/src/site/apt/1.7/parser_guide.apt
tika/site/src/site/apt/1.8/parser_guide.apt
tika/site/src/site/apt/1.9/parser_guide.apt

Modified: tika/site/src/site/apt/0.10/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.10/parser_guide.apt?rev=1726165&r1=1726164&r2=1726165&view=diff
==
--- tika/site/src/site/apt/0.10/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.10/parser_guide.apt Fri Jan 22 09:40:16 2016
@@ -128,6 +128,6 @@ public class HelloParser implements Pars
If you figure out the correct parser in a different way, it isn't needed. 

List your new parser in:
-
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}
+
{{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}

 

Modified: tika/site/src/site/apt/0.7/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.7/parser_guide.apt?rev=1726165&r1=1726164&r2=1726165&view=diff
==
--- tika/site/src/site/apt/0.7/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.7/parser_guide.apt Fri Jan 22 09:40:16 2016
@@ -128,6 +128,6 @@ public class HelloParser implements Pars
If you figure out the correct parser in a different way, it isn't needed. 

List your new parser in:
-
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}
+
{{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}

 

Modified: tika/site/src/site/apt/0.8/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.8/parser_guide.apt?rev=1726165&r1=1726164&r2=1726165&view=diff
==
--- tika/site/src/site/apt/0.8/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.8/parser_guide.apt Fri Jan 22 09:40:16 2016
@@ -128,6 +128,6 @@ public class HelloParser implements Pars
If you figure out the correct parser in a different way, it isn't needed. 

List your new parser in:
-
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}
+
{{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}

 

Modified: tika/site/src/site/apt/0.9/parser_guide.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.9/parser_guide.apt?rev=1726165&r1=1726164&r2=1726165&view=diff
==
--- tika/site/src/site/apt/0.9/parser_guide.apt (original)
+++ tika/site/src/site/apt/0.9/parser_guide.apt Fri Jan 22 09:40:16 2016
@@ -128,6 +128,6 @@ public class HelloParser implements Pars
If you figure out the correct parser in a different way, it isn't needed. 

List your new parser in:
-
{{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}
+
{{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources

svn commit: r1726153 - /tika/site/src/site/apt/contribute.apt.vm

2016-01-21 Thread nick
Author: nick
Date: Fri Jan 22 07:43:29 2016
New Revision: 1726153

URL: http://svn.apache.org/viewvc?rev=1726153&view=rev
Log:
Update the contributing guide for Git - other SVN references still remain

Modified:
tika/site/src/site/apt/contribute.apt.vm

Modified: tika/site/src/site/apt/contribute.apt.vm
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/contribute.apt.vm?rev=1726153&r1=1726152&r2=1726153&view=diff
==
--- tika/site/src/site/apt/contribute.apt.vm (original)
+++ tika/site/src/site/apt/contribute.apt.vm Fri Jan 22 07:43:29 2016
@@ -30,14 +30,13 @@ Source Code
To download the source code for the latest release of Apache Tika, please
see the {{{./download.html}Download page}}.
 
-   The master copy of the Apache Tika source code is held in SVN. You can
-   checkout the code from 
-   
{{{https://svn.apache.org/repos/asf/tika/trunk}https://svn.apache.org/repos/asf/tika/trunk}}
+   The master copy of the Apache Tika source code is held in GIT. You can
+   clone (checkout) the code from 
+   
{{{https://git-wip-us.apache.org/repos/asf/tika.git}https://git-wip-us.apache.org/repos/asf/tika.git}}
and you can browse it online through
-   {{{http://svn.apache.org/viewvc/tika/trunk/}Viewvc}}
+   {{{https://git-wip-us.apache.org/repos/asf?p=tika.git}Git web interface}}
 
-   For those who prefer working with Git, a read only mirror is available
-   from {{{http://git.apache.org/}git.apache.org}}. We also maintain a 
+   For those who prefer working on GitHub, we also maintain a 
{{{https://github.com/apache/tika/}GitHub mirror}}, which you are welcome
to fork from and open pull requests to.
 
@@ -76,13 +75,9 @@ Submitting Enhancements and Fixes
/ new code. The JIRA can be used for discussions on the code, and provides
a single identifier for the change.
 
-   SVN - For users of SVN, you can use <<>> to generate a patch
-   file of your changes, which can then be attached to the issue. Note that
-   a SVN diff won't normally include new or binary files, so these will need
-   to be attached separately.
-
-   Git - Git users can run <<>> to generate an SVN
-   compatible patch which can then be attached to an issue.
+   Git - Git users can run <<>> to generate a patch 
+   of changed and new files, including binaries, which can then be attached 
+   to an issue.
 
Github Pulls - If you are working from our 
{{{https://github.com/apache/tika/}GitHub mirror}}, it is possible to




svn commit: r1723581 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

2016-01-07 Thread nick
Author: nick
Date: Thu Jan  7 15:53:47 2016
New Revision: 1723581

URL: http://svn.apache.org/viewvc?rev=1723581&view=rev
Log:
Try to make the common parts clearer for the DER-encoded PKCS7 signature 
(length comes between 0x308. and the pkcs7 object)

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1723581&r1=1723580&r2=1723581&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Thu Jan  7 15:53:47 2016
@@ -524,15 +524,29 @@
   
 
 
+  
   
-  
-  
-  
-  
+  
+  
+ 
+  
+  
+ 
+  
+  
+ 
+  
+  
+ 
+  
+  
+ 
+  
 
   
 




svn commit: r1721390 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

2015-12-22 Thread nick
Author: nick
Date: Tue Dec 22 13:01:38 2015
New Revision: 1721390

URL: http://svn.apache.org/viewvc?rev=1721390&view=rev
Log:
TIKA-1817 Mime magic for AutoCAD DXF in Ascii and Binary, plus the related DXB

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1721390&r1=1721389&r2=1721390&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Tue Dec 22 13:01:38 2015
@@ -4795,12 +4795,41 @@
 
   
 
+  
+DXB
+<_comment>AutoCAD DXF simplified Binary
+http://en.wikipedia.org/wiki/AutoCAD_DXF
+
+  
+
+
+  
+
   
 DXF
 <_comment>AutoCAD DXF
 http://en.wikipedia.org/wiki/AutoCAD_DXF
+
 
   
+  
+
+<_comment>AutoCAD DXF in Binary form
+
+  
+
+  
+  
+
+<_comment>AutoCAD DXF in ASCII Text form
+
+  
+  
+
+  
+
+  
+
   
 
   




svn commit: r1717559 - /tika/branches/2.x/tika-core/pom.xml

2015-12-01 Thread nick
Author: nick
Date: Wed Dec  2 00:33:37 2015
New Revision: 1717559

URL: http://svn.apache.org/viewvc?rev=1717559&view=rev
Log:
Change what CLIRR checks against - we expect breakages vs Tika Core 1.0, that 
is why it is 2.0!

Modified:
tika/branches/2.x/tika-core/pom.xml

Modified: tika/branches/2.x/tika-core/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-core/pom.xml?rev=1717559&r1=1717558&r2=1717559&view=diff
==
--- tika/branches/2.x/tika-core/pom.xml (original)
+++ tika/branches/2.x/tika-core/pom.xml Wed Dec  2 00:33:37 2015
@@ -112,19 +112,15 @@
 
 
   
-org/apache/tika/config/TikaActivator
-org/apache/tika/metadata/Property$PropertyType
-org/apache/tika/metadata/Property$ValueType
-org/apache/tika/metadata/DublinCore
-org/apache/tika/metadata/Metadata
-org/apache/tika/metadata/MSOffice
-org/apache/tika/parser/EmptyParser
+org/apache/tika/config/LoadErrorHandler
   
   
 
   org.apache.tika
   tika-core
-  1.0
+  
+  
+  1.11
   jar
 
   




svn commit: r1717560 - in /tika/branches/2.x/tika-core/src: main/java/org/apache/tika/config/ test/java/org/apache/tika/config/

2015-12-01 Thread nick
Author: nick
Date: Wed Dec  2 00:33:41 2015
New Revision: 1717560

URL: http://svn.apache.org/viewvc?rev=1717560&view=rev
Log:
TIKA-1805 Notify via LoadErrorHandler if DefaultParser or DefaultDetector could 
not find any implementations of their service classes

Modified:

tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java

tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java

tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

tika/branches/2.x/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java

Modified: 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java?rev=1717560&r1=1717559&r2=1717560&view=diff
==
--- 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java
 (original)
+++ 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java
 Wed Dec  2 00:33:41 2015
@@ -39,6 +39,16 @@ public interface LoadErrorHandler {
  * @param throwable the encountered problem
  */
 void handleLoadError(String classname, Throwable throwable);
+
+/**
+ * Handles the case of no occurrences of the specified service interface
+ * being found. The implementation can log or otherwise process
+ * the given error information. If the method returns normally, then
+ * the service loader simply returns an empty list to the caller.
+ *
+ * @param interfacename name of the service interface with no occurrences
+ */
+void handleNoOccurrences(String interfacename);
 
 /**
  * Strategy that simply ignores all problems.
@@ -46,6 +56,8 @@ public interface LoadErrorHandler {
 LoadErrorHandler IGNORE = new LoadErrorHandler() {
 public void handleLoadError(String classname, Throwable throwable) {
 }
+public void handleNoOccurrences(String interfacename) {
+}
 @Override
 public String toString() {
 return "IGNORE";
@@ -61,6 +73,10 @@ public interface LoadErrorHandler {
 Logger.getLogger(classname).log(
 Level.WARNING, "Unable to load " + classname, throwable);
 }
+public void handleNoOccurrences(String interfacename) {
+Logger.getLogger(interfacename).log(
+Level.WARNING, "No occurrences found of " + interfacename);
+}
 @Override
 public String toString() {
 return "WARN";
@@ -76,6 +92,9 @@ public interface LoadErrorHandler {
 public void handleLoadError(String classname, Throwable throwable) {
 throw new RuntimeException("Unable to load " + classname, 
throwable);
 }
+public void handleNoOccurrences(String interfacename) {
+throw new RuntimeException("No occurrences found of " + 
interfacename);
+}
 @Override
 public String toString() {
 return "THROW";

Modified: 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java?rev=1717560&r1=1717559&r2=1717560&view=diff
==
--- 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
 (original)
+++ 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
 Wed Dec  2 00:33:41 2015
@@ -334,6 +334,9 @@ public class ServiceLoader {
 }
 }
 }
+if (providers.isEmpty()) {
+handler.handleNoOccurrences(iface.getName());
+}
 return providers;
 }
 

Modified: 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1717560&r1=1717559&r2=1717560&view=diff
==
--- 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
 (original)
+++ 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
 Wed Dec  2 00:33:41 2015
@@ -162,8 +162,8 @@ public class TikaConfig {
 ExecutorServiceXmlLoader executorLoader = new 
ExecutorServiceXmlLoader();
 
 this.mimeTypes = typesFromDomElement(element);
-this.detector = detectorLoader.loadOverall(element, mimeTypes, loader);
 this.parser = parserLoader.loadOverall(element, mimeTypes, loader);
+this.detector = detectorLoader.loadOverall(element, mimeT

svn commit: r1717557 - in /tika/branches/2.x: CHANGES.txt tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

2015-12-01 Thread nick
Author: nick
Date: Tue Dec  1 23:58:32 2015
New Revision: 1717557

URL: http://svn.apache.org/viewvc?rev=1717557&view=rev
Log:
Change the default LoadErrorHandler for Tika 2.x to be warn (TIKA-1805)

Modified:
tika/branches/2.x/CHANGES.txt

tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

Modified: tika/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/CHANGES.txt?rev=1717557&r1=1717556&r2=1717557&view=diff
==
--- tika/branches/2.x/CHANGES.txt (original)
+++ tika/branches/2.x/CHANGES.txt Tue Dec  1 23:58:32 2015
@@ -1,3 +1,11 @@
+Release 2.0 - Future Development
+
+  * The default LoadErrorHandler is now WARN, to alert you to missing
+parser classes and their dependencies. To keep the old behaviour,
+set your LoadErrorHandler to IGNORE. (TIKA-1805)
+
+  * (Something about more specific parser bundles, plus an overall one)
+
 Release 1.12 - Current Development
 
   * A parser to compute motion properties in Videos, e.g., 

Modified: 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1717557&r1=1717556&r2=1717557&view=diff
==
--- 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
 (original)
+++ 
tika/branches/2.x/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
 Tue Dec  1 23:58:32 2015
@@ -447,10 +447,10 @@ public class TikaConfig {
 ServiceLoader serviceLoader;
 if (serviceLoaderElement != null) {
 boolean dynamic = 
Boolean.parseBoolean(serviceLoaderElement.getAttribute("dynamic"));
-LoadErrorHandler loadErrorHandler = LoadErrorHandler.IGNORE;
+LoadErrorHandler loadErrorHandler = LoadErrorHandler.WARN;
 String loadErrorHandleConfig = 
serviceLoaderElement.getAttribute("loadErrorHandler");
-
if(LoadErrorHandler.WARN.toString().equalsIgnoreCase(loadErrorHandleConfig)) {
-loadErrorHandler = LoadErrorHandler.WARN;
+
if(LoadErrorHandler.IGNORE.toString().equalsIgnoreCase(loadErrorHandleConfig)) {
+loadErrorHandler = LoadErrorHandler.IGNORE;
 } else 
if(LoadErrorHandler.THROW.toString().equalsIgnoreCase(loadErrorHandleConfig)) {
 loadErrorHandler = LoadErrorHandler.THROW;
 }




svn commit: r1714493 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java

2015-11-15 Thread nick
Author: nick
Date: Sun Nov 15 19:56:25 2015
New Revision: 1714493

URL: http://svn.apache.org/viewvc?rev=1714493&view=rev
Log:
Fix inconsistent whitespace

Modified:

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1714493&r1=1714492&r2=1714493&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
 Sun Nov 15 19:56:25 2015
@@ -46,139 +46,139 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 public class GeoParser extends AbstractParser {
-   private static final long serialVersionUID = -2241391757440215491L;
-private static final Logger LOG = 
Logger.getLogger(GeoParser.class.getName());
-   private static final MediaType MEDIA_TYPE = 
-   MediaType.application("geotopic");
-   private static final Set SUPPORTED_TYPES = 
-   Collections.singleton(MEDIA_TYPE);
-   private GeoParserConfig config = new GeoParserConfig();
-
-   private boolean initialized;
-   private URL modelUrl;
-   private NameEntityExtractor extractor;
-   private boolean available;
-
-   @Override
-   public Set getSupportedTypes(ParseContext parseContext) {
-   return SUPPORTED_TYPES;
-   }
-
-   /**
-* Initializes this parser
-* @param modelUrl the URL to NER model
-*/
-   public void initialize(URL modelUrl) {
-
-   if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
-   //previously initialized for the same URL
-   return;
-   }
-   this.modelUrl = modelUrl;
-   //if NER model is available and lucene-geo-gazetteer is 
available
-   this.available = modelUrl != null &&
-   ExternalParser.check(new String[] { 
"lucene-geo-gazetteer", "--help" }, -1);
-   if (this.available) {
-   try {
-   this.extractor = new 
NameEntityExtractor(modelUrl);
-   } catch (Exception e) {
-   e.printStackTrace();
-   this.available = false;
-   }
-   }
-   initialized = true;
-
-   }
-
-   @Override
-   public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext 
context) throws IOException,
-   SAXException, TikaException {
-
-   /*configure this parser by ParseContext 
Object-*/
-
-   this.config = context.get(GeoParserConfig.class, config);
-   initialize(this.config.getNerModelUrl());
-   if (!isAvailable()) {
-   return;
-   }
-
-   /*get locationNameEntities and best nameEntity 
for the input stream-*/
-   extractor.getAllNameEntitiesfromInput(stream);
-   extractor.getBestNameEntity();
-   ArrayList locationNameEntities = 
extractor.locationNameEntities;
-   String bestner = extractor.bestNameEntity;
-
-   /*resolve geonames for each ner, store 
results in a hashmap-*/
-   HashMap<String, ArrayList> resolvedGeonames = 
searchGeoNames(locationNameEntities);
-
-   /*store locationNameEntities and their geonames 
in a geotag, each input has one geotag-*/
-   GeoTag geotag = new GeoTag();
-   geotag.toGeoTag(resolvedGeonames, bestner);
-
-   /* add resolved entities in metadata */
-
-   metadata.add("Geographic_NAME", geotag.Geographic_NAME);
-   metadata.add("Geographic_LONGITUDE", 
geotag.Geographic_LONGTITUDE);
-   metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE);
-   for (int i = 0; i < geotag.alternatives.size(); ++i) {
-   GeoTag alter = (GeoTag) geotag.alternatives.get(i);
-   metadata.add("Optional_NAME" + (i + 1), 
alter.Geographic_NAME);
-   metadata.add("Optional_LONGITUDE" + (i + 1),
-   alter.Geographic_LONGTITUDE);
-   meta

svn commit: r1714494 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java

2015-11-15 Thread nick
Author: nick
Date: Sun Nov 15 20:01:18 2015
New Revision: 1714494

URL: http://svn.apache.org/viewvc?rev=1714494&view=rev
Log:
TIKA-1791 Comments and logging

Modified:

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1714494&r1=1714493&r2=1714494&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
 Sun Nov 15 20:01:18 2015
@@ -71,24 +71,25 @@ public class GeoParser extends AbstractP
  */
 public void initialize(URL modelUrl) {
 if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
-// Previously initialized for the same URL
+// Previously initialized for the same URL, no initialization 
needed
 return;
 }
 
 this.modelUrl = modelUrl;
-//if NER model is available and lucene-geo-gazetteer is available
-this.available = modelUrl != null &&
-ExternalParser.check(new String[] { "lucene-geo-gazetteer", 
"--help" }, -1);
+
+// Check if the NER model is available, and if the
+//  lucene-geo-gazetteer is available
+this.available = modelUrl != null && ExternalParser.check(
+new String[] { "lucene-geo-gazetteer", "--help" }, -1);
 if (this.available) {
 try {
 this.extractor = new NameEntityExtractor(modelUrl);
 } catch (Exception e) {
-e.printStackTrace();
+LOG.warning("Named Entity Extractor setup failed: " + e);
 this.available = false;
 }
 }
 initialized = true;
-
 }
 
 @Override
@@ -126,9 +127,9 @@ public class GeoParser extends AbstractP
 GeoTag alter = (GeoTag) geotag.alternatives.get(i);
 metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME);
 metadata.add("Optional_LONGITUDE" + (i + 1),
-alter.Geographic_LONGTITUDE);
+ alter.Geographic_LONGTITUDE);
 metadata.add("Optional_LATITUDE" + (i + 1),
-alter.Geographic_LATITUDE);
+ alter.Geographic_LATITUDE);
 }
 }
 
@@ -149,8 +150,7 @@ public class GeoParser extends AbstractP
 exec.setWatchdog(watchdog);
 PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
 exec.setStreamHandler(streamHandler);
-int exitValue = exec.execute(cmdLine,
-EnvironmentUtils.getProcEnvironment());
+int exitValue = exec.execute(cmdLine, 
EnvironmentUtils.getProcEnvironment());
 String outputJson = outputStream.toString("UTF-8");
 JSONArray json = (JSONArray) JSONValue.parse(outputJson);
 
@@ -172,7 +172,6 @@ public class GeoParser extends AbstractP
 }
 
 return returnHash;
-
 }
 
 public boolean isAvailable() {




svn commit: r1714495 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic: GeoParserConfig.java NameEntityExtractor.java

2015-11-15 Thread nick
Author: nick
Date: Sun Nov 15 20:01:22 2015
New Revision: 1714495

URL: http://svn.apache.org/viewvc?rev=1714495&view=rev
Log:
Fix inconsistent whitespace

Modified:

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1714495&r1=1714494&r2=1714495&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
 Sun Nov 15 20:01:22 2015
@@ -23,34 +23,31 @@ import java.net.MalformedURLException;
 import java.net.URL;
 
 public class GeoParserConfig implements Serializable {
+private static final long serialVersionUID = -3167692634278575818L;
+private URL nerModelUrl = null;
 
-   private static final long serialVersionUID = 2L;
-   private URL nerModelUrl = null;
-
-   public GeoParserConfig() {
-   this.nerModelUrl = 
GeoParserConfig.class.getResource("en-ner-location.bin");
-   }
-
-   public void setNERModelPath(String path) {
-   if (path == null)
-   return;
-   File file = new File(path);
-   if (file.isDirectory() || !file.exists()) {
-   return;
-   }
-   try {
-   this.nerModelUrl = file.toURI().toURL();
-   } catch (MalformedURLException e) {
-   throw new RuntimeException(e);
-   }
-   }
-
-   public void setNerModelUrl(URL url) {
-   this.nerModelUrl = url;
-   }
-
-   public URL getNerModelUrl() {
-   return nerModelUrl;
-   }
-
+public GeoParserConfig() {
+this.nerModelUrl = 
GeoParserConfig.class.getResource("en-ner-location.bin");
+}
+
+public void setNERModelPath(String path) {
+if (path == null)
+return;
+File file = new File(path);
+if (file.isDirectory() || !file.exists()) {
+return;
+}
+try {
+this.nerModelUrl = file.toURI().toURL();
+} catch (MalformedURLException e) {
+throw new RuntimeException(e);
+}
+}
+
+public void setNerModelUrl(URL url) {
+this.nerModelUrl = url;
+}
+public URL getNerModelUrl() {
+return nerModelUrl;
+}
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1714495&r1=1714494&r2=1714495&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
 Sun Nov 15 20:01:22 2015
@@ -37,93 +37,88 @@ import org.apache.commons.io.IOUtils;
 import static java.nio.charset.StandardCharsets.UTF_8;
 
 public class NameEntityExtractor {
-
-   ArrayList locationNameEntities;
-   String bestNameEntity;
-   private HashMap<String, Integer> tf;
-   private final NameFinderME nameFinder;
-
-   public NameEntityExtractor(URL modelUrl) throws IOException {
-   this.locationNameEntities = new ArrayList();
-   this.bestNameEntity = null;
-   TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
-   this.nameFinder = new NameFinderME(model);
-   this.tf = new HashMap<String, Integer>();
-   }
-
-   /*
-* Use OpenNLP to extract location names that's appearing in the steam.
-* OpenNLP's default Name Finder accuracy is not very good, please 
refer to
-* its documentation.
-* 
-* @param stream stream that passed from this.parse()
-*/
-
-   public void getAllNameEntitiesfromInput(InputStream stream)
-   throws IOException {
-
-
-   String[] in = IOUtils.toString(stream, UTF_8).split(" ");
-   Span nameE[];
-   //name finder is not thread safe 
https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
-   synchronized (nameFinder) {
-   nameE = nameFinder.find(in);
-   //the same name finder is reused, so clear adaptive data
- 

svn commit: r1714496 - /tika/trunk/CHANGES.txt

2015-11-15 Thread nick
Author: nick
Date: Sun Nov 15 20:03:00 2015
New Revision: 1714496

URL: http://svn.apache.org/viewvc?rev=1714496&view=rev
Log:
Changelog update

Modified:
tika/trunk/CHANGES.txt

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1714496&r1=1714495&r2=1714496&view=diff
==
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Nov 15 20:03:00 2015
@@ -8,6 +8,8 @@ Release 1.12 - Current Development
   * Tika Facade parse methods for Path and File added which take a
 Metadata object, to mirror the existing InputStream one (GitHub-60)
 
+  * GeoParser fix for loading the NER model from a jar file (TIKA-1791)
+
 
 Release 1.11 - 10/18/2015
 




svn commit: r1714341 - in /tika/site: publish/1.11/gettingstarted.html src/site/apt/1.11/gettingstarted.apt

2015-11-14 Thread nick
Author: nick
Date: Sat Nov 14 16:23:57 2015
New Revision: 1714341

URL: http://svn.apache.org/viewvc?rev=1714341&view=rev
Log:
Add Gradle and Ivy instructions

Modified:
tika/site/publish/1.11/gettingstarted.html
tika/site/src/site/apt/1.11/gettingstarted.apt

Modified: tika/site/publish/1.11/gettingstarted.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.11/gettingstarted.html?rev=1714341&r1=1714340&r2=1714341&view=diff
==
--- tika/site/publish/1.11/gettingstarted.html (original)
+++ tika/site/publish/1.11/gettingstarted.html Sat Nov 14 16:23:57 2015
@@ -111,33 +111,45 @@
  Tika bundle. An OSGi bundle that combines tika-parsers with non-OSGified 
parser libraries to make them easy to deploy in an OSGi 
environment.
 
 Using Tika as a Maven 
dependency
-The core library, tika-core, contains the key interfaces and classes of 
Tika and can be used by itself if you don't need the full set of parsers from 
the tika-parsers component. The tika-core dependency looks like this:
+The core library,  tika-core , contains the key interfaces and 
classes of Tika and can be used by itself if you don't need the full set of 
parsers from the  tika-parsers  component. The tika-core dependency 
looks like this:
 
   dependency
 groupIdorg.apache.tika/groupId
 artifactIdtika-core/artifactId
-version.../version
+version1.11/version
   /dependency
-If you want to use Tika to parse documents (instead of simply detecting 
document types, etc.), you'll want to depend on tika-parsers instead: 
+If you want to use Tika to parse documents (instead of simply detecting 
document types, etc.), you'll want to depend on  tika-parsers  
instead: 
 
   dependency
 groupIdorg.apache.tika/groupId
 artifactIdtika-parsers/artifactId
-version.../version
+version1.11/version
   /dependency
 Note that adding this dependency will introduce a number of transitive 
dependencies to your project, including one on tika-core. You need to make sure 
that these dependencies won't conflict with your existing project dependencies. 
You can use the following command in the tika-parsers directory to get a full 
listing of all the dependencies.
 
 $ mvn dependency:tree | grep :compile
 
+Using Tika in a 
Gradle-built project
+To add a dependency on Apache Tika to your Gradle built project, including 
the full set of parsers, you should depend on the  tika-parsers  
artifact:
+
+dependencies {
+runtime 'org.apache.tika:tika-parsers:1.11'
+}
+
 Using Tika in an Ant 
project
-Unless you use a dependency manager tool like http://ant.apache.org/ivy/;>Apache Ivy, the easiest way to use Tika 
is to include either the tika-core or the tika-app jar in your classpath, 
depending on whether you want just the core functionality or also all the 
parser implementations.
+If you are using http://ant.apache.org/ivy/;>Apache Ivy as your dependency manager 
tool with Ant, then to include Tika with the full set of parsers, you should 
depend on the  tika-parsers  artifact like this:
+
+dependencies
+dependency org=org.apache.tika 
name=tika-parsers rev=1.11/
+/dependencies
+Otherwise, probably the easiest way to use Tika is to include the full  
tika-app  jar on your classpath. For just core functionality, you can add 
the  tika-core  jar, but be aware that the full set of parsers have a 
large number of dependencies which must be included which is very fiddly to do 
by hand with Ant! To include Tika in your Ant project, you should do something 
like:
 
 classpath
   ... !-- your other classpath entries --
 
-  !-- either: --
+  !-- either: Tika Core only, no parsers --
   pathelement 
location=path/to/tika-core-${tika.version}.jar/
-  !-- or: --
+  !-- or: Tika with all Parsers--
   pathelement 
location=path/to/tika-app-${tika.version}.jar/
 
 /classpath

Modified: tika/site/src/site/apt/1.11/gettingstarted.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/1.11/gettingstarted.apt?rev=1714341&r1=1714340&r2=1714341&view=diff
==
--- tika/site/src/site/apt/1.11/gettingstarted.apt (original)
+++ tika/site/src/site/apt/1.11/gettingstarted.apt Sat Nov 14 16:23:57 2015
@@ -71,26 +71,27 @@ Build artifacts
 
 Using Tika as a Maven dependency
 
- The core library, tika-core, contains the key interfaces and classes of Tika
- and can be used by itself if you don't need the full set of parsers from
- the tika-parsers component. The tika-core dependency looks like this:
+ The core library, <<< tika-core >>>, contains the key interfaces and classes
+ of Tika and can be used by itself if you don't need the full set of parsers 
+ from the <<< tika-parsers >>> component. The tika-core dependency looks like 
+ this:
 
 ---
   
 org.apache.tika
 tika-core
-...
+1.11
   
 ---
 
  If you want to use Tika to parse documents (instead 

svn commit: r1714361 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

2015-11-14 Thread nick
Author: nick
Date: Sat Nov 14 20:25:59 2015
New Revision: 1714361

URL: http://svn.apache.org/viewvc?rev=1714361&view=rev
Log:
TIKA-1793 Add rfc822 email detection for common thunderbird message first 
headers

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1714361&r1=1714360&r2=1714361&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Sat Nov 14 20:25:59 2015
@@ -5157,6 +5157,8 @@
   
 
   
+  
+  
   
   
   
@@ -5167,6 +5169,7 @@
   
   
   
+  
   
   
 




svn commit: r1713677 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/ tika-parsers/src/test/java/org/apache/tika/detect/ tika

2015-11-10 Thread nick
Author: nick
Date: Tue Nov 10 16:18:45 2015
New Revision: 1713677

URL: http://svn.apache.org/viewvc?rev=1713677&view=rev
Log:
TIKA-1792 ASiC E and S mimetypes, detection and tests. Files and mimetype from 
Roberto Benedetti

Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCE.asice   
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCS.asics   
(with props)
Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1713677&r1=1713676&r2=1713677&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Tue Nov 10 16:18:45 2015
@@ -992,6 +992,33 @@
 
 
   
+
+  
+ASiC-E
+<_comment>Extended Associated Signature Container
+
+
+
+  
+
+  
+
+
+  
+
+  
+ASiC-S
+<_comment>Simple Associated Signature Container
+
+
+
+  
+
+  
+
+
+  
+
   
   
   
@@ -3834,6 +3861,7 @@
   
   
 
+
 
   
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1713677&r1=1713676&r2=1713677&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
 Tue Nov 10 16:18:45 2015
@@ -180,9 +180,9 @@ public class ZipContainerDetector implem
 }
 
 /**
- * OpenDocument files, along with EPub files, have a mimetype
- *  entry in the root of their Zip file. This entry contains the
- *  mimetype of the overall file, stored as a single string.  
+ * OpenDocument files, along with EPub files and ASiC ones, have a 
+ *  mimetype entry in the root of their Zip file. This entry contains
+ *  the mimetype of the overall file, stored as a single string.  
  */
 private static MediaType detectOpenDocument(ZipFile zip) {
 try {

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1713677&r1=1713676&r2=1713677&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 Tue Nov 10 16:18:45 2015
@@ -324,7 +324,15 @@ public class TestContainerAwareDetector
 public void testDetectIPA() throws Exception {
 assertTypeByNameAndData("testIPA.ipa", "application/x-itunes-ipa");
 assertTypeByData("testIPA.ipa", "application/x-itunes-ipa");
- }
+}
+
+@Test
+public void testASiC() throws Exception {
+assertTypeByData("testASiCE.asice", "application/vnd.etsi.asic-e+zip");
+assertTypeByData("testASiCS.asics", "application/vnd.etsi.asic-s+zip");
+assertTypeByNameAndData("testASiCE.asice", 
"application/vnd.etsi.asic-e+zip");
+assertTypeByNameAndData("testASiCS.asics", 
"application/vnd.etsi.asic-s+zip");
+}
  
 @Test
 public void testDetectZip() throws Exception {

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCE.asice
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCE.asice?rev=1713677&view=auto
==
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCE.asice
--
svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCS.asics
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testASiCS.asics?rev=1713677&view=auto
=

svn commit: r1713697 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

2015-11-10 Thread nick
Author: nick
Date: Tue Nov 10 16:58:04 2015
New Revision: 1713697

URL: http://svn.apache.org/viewvc?rev=1713697&view=rev
Log:
Tweak ASiC comment and priority based on feedback from the spec

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1713697&r1=1713696&r2=1713697&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Tue Nov 10 16:58:04 2015
@@ -997,8 +997,8 @@
 ASiC-E
 <_comment>Extended Associated Signature Container
 
-
-
+
+
   
 
   
@@ -1010,8 +1010,8 @@
 ASiC-S
 <_comment>Simple Associated Signature Container
 
-
-
+
+
   
 
   




svn commit: r1711162 - in /tika/trunk: CHANGES.txt tika-core/src/main/java/org/apache/tika/Tika.java

2015-10-28 Thread nick
Author: nick
Date: Wed Oct 28 23:21:41 2015
New Revision: 1711162

URL: http://svn.apache.org/viewvc?rev=1711162&view=rev
Log:
Add Tika Facade parse methods for Path and File which take a Metadata object, 
to mirror the existing InputStream one. This closes #60 from GitHub

Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1711162&r1=1711161&r2=1711162&view=diff
==
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Oct 28 23:21:41 2015
@@ -5,6 +5,9 @@ Release 1.12 - Current Development
 
   * Fix regression with spacing in PPT via Andreas Beeker (TIKA-1777).
 
+  * Tika Facade parse methods for Path and File added which take a
+Metadata object, to mirror the existing InputStream one (GitHub-60)
+
 
 Release 1.11 - 10/18/2015
 

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=1711162&r1=1711161&r2=1711162&view=diff
==
--- tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Wed Oct 28 
23:21:41 2015
@@ -283,7 +283,8 @@ public class Tika {
  */
 public String detect(File file) throws IOException {
 Metadata metadata = new Metadata();
-try (InputStream stream = TikaInputStream.get(file, metadata)) {
+try (@SuppressWarnings("deprecation")
+InputStream stream = TikaInputStream.get(file, metadata)) {
 return detect(stream, metadata);
 }
 }
@@ -399,7 +400,7 @@ public class Tika {
  * the time when the {@link Reader#close()} method is called.
  *
  * @param stream the document to be parsed
- * @param metadata document metadata
+ * @param metadata where document's metadata will be populated
  * @return extracted text content
  * @throws IOException if the document can not be read or parsed
  */
@@ -427,32 +428,62 @@ public class Tika {
 
 /**
  * Parses the file at the given path and returns the extracted text 
content.
+ * 
+ * Metadata information extracted from the document is returned in 
+ *  the supplied metadata instance.
  *
  * @param path the path of the file to be parsed
+ * @param metadata where document's metadata will be populated
  * @return extracted text content
  * @throws IOException if the file can not be read or parsed
  */
-public Reader parse(Path path) throws IOException {
-Metadata metadata = new Metadata();
+public Reader parse(Path path, Metadata metadata) throws IOException {
 InputStream stream = TikaInputStream.get(path, metadata);
 return parse(stream, metadata);
 }
+
+/**
+ * Parses the file at the given path and returns the extracted text 
content.
+ *
+ * @param path the path of the file to be parsed
+ * @return extracted text content
+ * @throws IOException if the file can not be read or parsed
+ */
+public Reader parse(Path path) throws IOException {
+return parse(path, new Metadata());
+}
 
 /**
  * Parses the given file and returns the extracted text content.
+ * 
+ * Metadata information extracted from the document is returned in 
+ *  the supplied metadata instance.
  *
  * @param file the file to be parsed
+ * @param metadata where document's metadata will be populated
  * @return extracted text content
  * @throws IOException if the file can not be read or parsed
  * @see #parse(Path)
  */
-public Reader parse(File file) throws IOException {
-Metadata metadata = new Metadata();
+public Reader parse(File file, Metadata metadata) throws IOException {
+@SuppressWarnings("deprecation")
 InputStream stream = TikaInputStream.get(file, metadata);
 return parse(stream, metadata);
 }
 
 /**
+ * Parses the given file and returns the extracted text content.
+ *
+ * @param file the file to be parsed
+ * @return extracted text content
+ * @throws IOException if the file can not be read or parsed
+ * @see #parse(Path)
+ */
+public Reader parse(File file) throws IOException {
+return parse(file, new Metadata());
+}
+
+/**
  * Parses the resource at the given URL and returns the extracted
  * text content.
  *
@@ -606,6 +637,7 @@ public class Tika {
  */
 public String parseToString(File file) throws IOException, TikaException {
 Metadata metadata = new Metadata();
+@SuppressWarnings("deprecation")
 InputStream stream = TikaInputStream.get(file, metadata);

svn commit: r1710536 - in /tika/site/publish: 1.11/formats.html 1.12/ 1.12/examples.html 1.12/formats.html

2015-10-26 Thread nick
Author: nick
Date: Mon Oct 26 09:24:17 2015
New Revision: 1710536

URL: http://svn.apache.org/viewvc?rev=1710536&view=rev
Log:
Publish site changes

Added:
tika/site/publish/1.12/
tika/site/publish/1.12/examples.html
tika/site/publish/1.12/formats.html
Modified:
tika/site/publish/1.11/formats.html

Modified: tika/site/publish/1.11/formats.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.11/formats.html?rev=1710536&r1=1710535&r2=1710536&view=diff
==
--- tika/site/publish/1.11/formats.html (original)
+++ tika/site/publish/1.11/formats.html Mon Oct 26 09:24:17 2015
@@ -208,7 +208,423 @@
 The JackcessParser
 is able to extract metadata and content in a tabular form, from Microsoft 
Access database files.
 
 Full list of supported 
formats:
-TODO Populate this at release time
+
+org.apache.tika.parser.asm.ClassParser
+
+application/java-vm
+org.apache.tika.parser.audio.AudioParser
+
+audio/x-wav
+audio/x-aiff
+audio/basic
+org.apache.tika.parser.audio.MidiParser
+
+application/x-midi
+audio/midi
+org.apache.tika.parser.chm.ChmParser
+
+application/vnd.ms-htmlhelp
+application/chm
+application/x-chm
+org.apache.tika.parser.code.SourceCodeParser
+
+text/x-java-source
+text/x-c++src
+text/x-groovy
+org.apache.tika.parser.crypto.Pkcs7Parser
+
+application/pkcs7-signature
+application/pkcs7-mime
+org.apache.tika.parser.dif.DIFParser
+
+application/dif+xml
+org.apache.tika.parser.dwg.DWGParser
+
+image/vnd.dwg
+org.apache.tika.parser.epub.EpubParser
+
+application/x-ibooks+zip
+application/epub+zip
+org.apache.tika.parser.executable.ExecutableParser
+
+application/x-elf
+application/x-sharedlib
+application/x-executable
+application/x-msdownload
+application/x-coredump
+application/x-object
+org.apache.tika.parser.external.ExternalParser
+
+video/mp4
+video/avi
+video/mpeg
+video/x-msvideo
+org.apache.tika.parser.feed.FeedParser
+
+application/atom+xml
+application/rss+xml
+org.apache.tika.parser.font.AdobeFontMetricParser
+
+application/x-font-adobe-metric
+org.apache.tika.parser.font.TrueTypeParser
+
+application/x-font-ttf
+org.apache.tika.parser.gdal.GDALParser
+
+image/x-ozi
+application/x-snodas
+application/x-ecrg-toc
+image/envisat
+application/x-doq2
+application/x-rs2
+application/x-gsag
+application/x-ers
+application/fits
+application/x-pnm
+image/adrg
+image/gif
+application/x-generic-bin
+application/x-bt
+application/x-zmap
+application/x-hdf
+image/eir
+application/x-ace2
+application/grass-ascii-grid
+application/x-l1b
+application/x-gsc
+image/jp2
+image/hfa
+image/fits
+image/raster
+application/x-epsilon
+image/x-srp
+application/x-envi-hdr
+application/x-ctable2
+application/x-srtmhgt
+application/jaxa-pal-sar
+application/x-ndf
+application/sdts-raster
+application/x-gtx
+application/x-rst
+application/x-xyz
+application/terragen
+application/x-gs7bg
+image/arg
+application/elas
+image/big-gif
+application/x-geo-pdf
+application/x-ctg
+application/aaigrid
+application/x-lcp
+application/x-nwt-grc
+application/x-fast
+application/x-usgs-dem
+application/x-nwt-grd
+application/x-ingr
+application/x-envi
+application/x-rik
+application/x-blx
+application/x-wcs
+image/ceos
+application/x-ngs-geoid
+application/x-r
+image/bmp
+application/x-http
+application/x-til
+application/x-pds
+application/x-rasterlite
+application/x-gmt
+application/x-msgn
+image/ilwis
+application/aig
+application/x-rmf
+image/x-hdf5-image
+image/sar-ceos
+application/x-kro
+application/vrt
+application/x-netcdf
+image/nitf
+image/png
+image/geotiff
+image/x-mff2
+application/x-webp
+image/ida
+application/x-gsbg
+application/x-ntv2
+application/x-coasp
+application/x-los-las
+application/x-tsx
+application/x-bag
+image/fit
+application/x-lan
+application/x-map
+image/jpeg
+application/x-dods
+application/jdem
+application/gff
+application/x-isis2
+application/x-isis3
+application/xpm
+application/x-pcidsk
+application/x-gxf
+application/x-wms
+application/x-cosar
+image/bsb
+application/x-grib
+application/x-mbtiles
+application/x-cappi
+application/x-rpf-toc
+image/x-mff
+image/x-dimap
+image/x-pcraster
+application/x-ppi
+application/x-sdat
+application/pcisdk
+application/x-cpg
+application/leveller
+image/sgi
+image/x-fujibas
+image/x-airsar
+application/x-e00-grid
+application/x-kml
+application/x-p-aux
+application/x-doq1
+application/dted
+application/x-dipex
+org.apache.tika.parser.geo.topic.GeoParser
+
+application/geotopic
+org.apache.tika.parser.geoinfo.GeographicInformationParser
+
+text/iso19139+xml
+org.apache.tika.parser.grib.GribParser
+
+application/x-grib2
+org.apache.tika.parser.hdf.HDFParser
+
+application/x-hdf
+org.apache.tika.parser.html.HtmlParser
+
+application/x-asp
+application/xhtml+xml
+application/vnd.wap.xhtml+xml
+text/html
+org.apache.tika.parser.image.BPGParser
+
+image/bpg
+image/x-bpg
+org.apache.tika.parser.image.ImageParser
+
+image/x-ms-bmp
+image/png
+image/x-icon
+image/vnd.wap.wbmp
+image/gif

svn commit: r1708950 - /tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG.jp2

2015-10-16 Thread nick
Author: nick
Date: Fri Oct 16 10:32:28 2015
New Revision: 1708950

URL: http://svn.apache.org/viewvc?rev=1708950&view=rev
Log:
Test JP2 (JPEG2000) file from Andreas Hirtzel from TIKA-1773

Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG.jp2   
(with props)

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG.jp2
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG.jp2?rev=1708950&view=auto
==
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG.jp2
--
svn:mime-type = application/octet-stream




svn commit: r1708940 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

2015-10-16 Thread nick
Author: nick
Date: Fri Oct 16 10:04:47 2015
New Revision: 1708940

URL: http://svn.apache.org/viewvc?rev=1708940&view=rev
Log:
TIKA-1772 WebVTT mime entry from Alexander Widera

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1708940&r1=1708939&r2=1708940&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Fri Oct 16 10:04:47 2015
@@ -5538,6 +5538,13 @@
 
   
 
+  
+<_comment>Web Video Text Tracks Format
+WebVTT
+
+
+  
+
   
 <_comment>AWK script
 




svn commit: r1708975 - /tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

2015-10-16 Thread nick
Author: nick
Date: Fri Oct 16 12:33:54 2015
New Revision: 1708975

URL: http://svn.apache.org/viewvc?rev=1708975&view=rev
Log:
JPEG2000 (jp2) detection tests

Modified:

tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1708975&r1=1708974&r2=1708975&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
(original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
Fri Oct 16 12:33:54 2015
@@ -394,6 +394,10 @@ public class TestMimeTypes {
 assertTypeByName("image/jpeg", "x.jif");
 assertTypeByName("image/jpeg", "x.jfif");
 assertTypeByName("image/jpeg", "x.jfi");
+
+assertType("image/jp2", "testJPEG.jp2");
+assertTypeByData("image/jp2", "testJPEG.jp2");
+assertTypeByName("image/jp2", "x.jp2");
 }
 
 @Test




svn commit: r1705181 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java

2015-09-24 Thread nick
Author: nick
Date: Thu Sep 24 22:38:38 2015
New Revision: 1705181

URL: http://svn.apache.org/viewvc?rev=1705181&view=rev
Log:
Expand the Tika Config dumping support for parsers

Modified:

tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java

Modified: 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java?rev=1705181&r1=1705180&r2=1705181&view=diff
==
--- 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
 (original)
+++ 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
 Thu Sep 24 22:38:38 2015
@@ -24,6 +24,7 @@ import java.io.OutputStreamWriter;
 import java.io.StringWriter;
 import java.io.Writer;
 import java.nio.charset.Charset;
+import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 import java.util.TreeSet;
@@ -97,8 +98,8 @@ public class DumpTikaConfigExample {
 }
 
 private void addTranslator(Mode mode, Element rootElement, Document doc, 
TikaConfig config) {
-// TikaConfig only reads the first translator from the list,
-//  but it looks like it expects a list
+// Unlike the other entries, TikaConfig only wants one of
+//  these, and no outer  list
 Translator translator = config.getTranslator();
 if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) {
 Node mimeComment = doc.createComment(
@@ -160,54 +161,65 @@ public class DumpTikaConfigExample {
 } else if (mode == Mode.MINIMAL) {
 mode = Mode.CURRENT;
 }
-addParsers(mode, rootElement, doc, parser);
+
+Element parsersElement = doc.createElement("parsers");
+rootElement.appendChild(parsersElement);
+
+addParser(mode, parsersElement, doc, parser);
 }
-private void addParsers(Mode mode, Element rootElement, Document doc, 
Parser parser) throws Exception {
-Parser realParser = parser;
+private void addParser(Mode mode, Element rootElement, Document doc, 
Parser parser) throws Exception {
+// If the parser is decorated, is it a kind where we output the parser 
inside?
+ParserDecorator decoration = null;
 if (parser instanceof ParserDecorator) {
-realParser = ((ParserDecorator)parser).getWrappedParser();
+if 
(parser.getClass().getName().startsWith(ParserDecorator.class.getName()+"$")) {
+decoration = ((ParserDecorator)parser);
+parser = decoration.getWrappedParser();
+}
 }
 
-List children = null;
-if (mode == Mode.CURRENT && realParser instanceof DefaultParser) {
-// Don't output any children
-// TODO List excluded children
-} else if (realParser instanceof CompositeParser) {
-children = ((CompositeParser)realParser).getAllComponentParsers();
-if (realParser instanceof DefaultParser || parser == realParser) {
-realParser = null;
+boolean outputParser = true;
+List children = Collections.emptyList();
+if (mode == Mode.CURRENT && parser instanceof DefaultParser) {
+// Only output the parser, not the children
+} else if (parser instanceof CompositeParser) {
+children = ((CompositeParser)parser).getAllComponentParsers();
+// Special case for a naked composite
+if (parser.getClass().equals(CompositeParser.class)) {
+outputParser = false;
+}
+// Special case for making Default to static
+if (mode == Mode.STATIC && parser instanceof DefaultParser) {
+outputParser = false;
 }
 }
 
-Element parsersElement = doc.createElement("parsers");
-rootElement.appendChild(parsersElement);
-Element addParserTo = parsersElement;
-
-if (realParser != null) {
-addParserTo = addParser(addParserTo, doc, parser, realParser);
+if (outputParser) {
+rootElement = addParser(rootElement, doc, parser, decoration);
 }
-if (children != null && !children.isEmpty()) {
-for (Parser p : children) {
-addParser(addParserTo, doc, p, p);
-}
+for (Parser childParser : children) {
+addParser(mode, rootElement, doc, childParser);
 }
+// TODO Parser Exclusions
 }
-private Element addParser(Element rootElement, Document doc, Parser 
parser, Parser realParser) throws Exception {
+private Element addParser(Element rootElement, Document doc, Parser 
parser, ParserDecor

svn commit: r1705191 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika-example/src/main/java/or

2015-09-24 Thread nick
Author: nick
Date: Thu Sep 24 22:59:15 2015
New Revision: 1705191

URL: http://svn.apache.org/viewvc?rev=1705191&view=rev
Log:
Expose the ServiceLoader used by TikaConfig, and use that to support 
serialising the service loader config xml section

Modified:

tika/trunk/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java?rev=1705191&r1=1705190&r2=1705191&view=diff
==
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java 
Thu Sep 24 22:59:15 2015
@@ -46,6 +46,10 @@ public interface LoadErrorHandler {
 LoadErrorHandler IGNORE = new LoadErrorHandler() {
 public void handleLoadError(String classname, Throwable throwable) {
 }
+@Override
+public String toString() {
+return "IGNORE";
+}
 };
 
 /**
@@ -57,6 +61,10 @@ public interface LoadErrorHandler {
 Logger.getLogger(classname).log(
 Level.WARNING, "Unable to load " + classname, throwable);
 }
+@Override
+public String toString() {
+return "WARN";
+}
 };
 
 /**
@@ -68,6 +76,9 @@ public interface LoadErrorHandler {
 public void handleLoadError(String classname, Throwable throwable) {
 throw new RuntimeException("Unable to load " + classname, 
throwable);
 }
+@Override
+public String toString() {
+return "THROW";
+}
 };
-
 }

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1705191&r1=1705190&r2=1705191&view=diff
==
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Thu Sep 24 22:59:15 2015
@@ -79,6 +79,7 @@ public class TikaConfig {
 private static Translator getDefaultTranslator(ServiceLoader loader) {
 return new DefaultTranslator(loader);
 }
+private final ServiceLoader serviceLoader;
 private final CompositeParser parser;
 private final CompositeDetector detector;
 private final Translator translator;
@@ -143,6 +144,7 @@ public class TikaConfig {
 this.detector = detectorLoader.loadOverall(element, mimeTypes, loader);
 this.parser = parserLoader.loadOverall(element, mimeTypes, loader);
 this.translator = translatorLoader.loadOverall(element, mimeTypes, 
loader);
+this.serviceLoader = loader;
 }
 
 /**
@@ -159,7 +161,7 @@ public class TikaConfig {
  */
 public TikaConfig(ClassLoader loader)
 throws MimeTypeException, IOException {
-ServiceLoader serviceLoader = new ServiceLoader(loader);
+this.serviceLoader = new ServiceLoader(loader);
 this.mimeTypes = getDefaultMimeTypes(loader);
 this.detector = getDefaultDetector(mimeTypes, serviceLoader);
 this.parser = getDefaultParser(mimeTypes, serviceLoader);
@@ -184,7 +186,7 @@ public class TikaConfig {
  * @throws TikaException if problem with MimeTypes or parsing XML config
  */
 public TikaConfig() throws TikaException, IOException {
-ServiceLoader loader = new ServiceLoader();
+this.serviceLoader = new ServiceLoader();
 
 String config = System.getProperty("tika.config");
 if (config == null) {
@@ -193,9 +195,9 @@ public class TikaConfig {
 
 if (config == null) {
 this.mimeTypes = 
getDefaultMimeTypes(ServiceLoader.getContextClassLoader());
-this.parser = getDefaultParser(mimeTypes, loader);
-this.detector = getDefaultDetector(mimeTypes, loader);
-this.translator = getDefaultTranslator(loader);
+this.parser = getDefaultParser(mimeTypes, serviceLoader);
+this.detector = getDefaultDetector(mimeTypes, serviceLoader);
+this.translator = getDefaultTranslator(serviceLoader);
 } else {
 // Locate the given configuration file
 InputStream stream = null;
@@ -210,7 +212,7 @@ public class TikaConfig {
 }
 }
 if (stream == null) {
-stream = loader.getResourceAsStream(config);
+ 

svn commit: r1704934 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/language/translate/ tika-example/src/main/java/org/apache/tika/example/ tika-example/src/test/java/org/apache/tika/exampl

2015-09-23 Thread nick
Author: nick
Date: Wed Sep 23 21:04:08 2015
New Revision: 1704934

URL: http://svn.apache.org/viewvc?rev=1704934&view=rev
Log:
TIKA-1657 Update the example of dumping a Tika Config to support different 
output modes, for Translators and Detectors

Modified:

tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java

tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java

tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java?rev=1704934&r1=1704933&r2=1704934&view=diff
==
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
 (original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
 Wed Sep 23 21:04:08 2015
@@ -99,6 +99,19 @@ public class DefaultTranslator implement
 }
 throw new TikaException("No translators currently available");
 }
+
+/**
+ * Returns all available translators
+ */
+public List getTranslators() {
+return getDefaultTranslators(loader);
+}
+/**
+ * Returns the current translator
+ */
+public Translator getTranslator() {
+return getFirstAvailable(loader);
+}
 
 public boolean isAvailable() {
 return getFirstAvailable(loader) != null;

Modified: 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java?rev=1704934&r1=1704933&r2=1704934&view=diff
==
--- 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
 (original)
+++ 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
 Wed Sep 23 21:04:08 2015
@@ -17,7 +17,8 @@
 
 package org.apache.tika.example;
 
-import java.io.File;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
@@ -29,6 +30,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
+
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.transform.OutputKeys;
@@ -38,6 +40,7 @@ import javax.xml.transform.dom.DOMSource
 import javax.xml.transform.stream.StreamResult;
 
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.CompositeDetector;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
@@ -51,8 +54,6 @@ import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
 
 /**
  * This class shows how to dump a TikaConfig object to a configuration file.
@@ -70,21 +71,21 @@ public class DumpTikaConfigExample {
  * @param writer writer to which to write
  * @throws Exception
  */
-public void dump(TikaConfig config, Writer writer, String encoding) throws 
Exception {
+public void dump(TikaConfig config, Mode mode, Writer writer, String 
encoding) throws Exception {
 DocumentBuilderFactory docFactory = 
DocumentBuilderFactory.newInstance();
 DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+
 // root elements
 Document doc = docBuilder.newDocument();
 Element rootElement = doc.createElement("properties");
 
 doc.appendChild(rootElement);
-addMimeComment(rootElement, doc);
-addTranslator(rootElement, doc, config);
-addDetectors(rootElement, doc, config);
-addParsers(rootElement, doc, config);
+addMimeComment(mode, rootElement, doc);
+addTranslator(mode, rootElement, doc, config);
+addDetectors(mode, rootElement, doc, config);
+addParsers(mode, rootElement, doc, config);
 
-
-//now write
+// now write
 TransformerFactory transformerFactory = 
TransformerFactory.newInstance();
 Transformer transformer = transformerFactory.newTransformer();
 transformer.setOutputProperty(OutputKeys.INDENT, "yes");
@@ -96,33 +97,50 @@ public class DumpTikaConfigExample {
 transformer.transform(source, result);
 }
 
-private void addTranslator(Element rootElement, Document doc, TikaConfig 
config) {
-//TikaConfig only reads the first translator from the list,
-//but it looks like it ex

svn commit: r1701201 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

2015-09-04 Thread nick
Author: nick
Date: Fri Sep  4 09:56:49 2015
New Revision: 1701201

URL: http://svn.apache.org/r1701201
Log:
TIKA-1728 Fix the HWP v5 mime type hierarchy

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1701201&r1=1701200&r2=1701201&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Fri Sep  4 09:56:49 2015
@@ -3242,7 +3242,7 @@
   
   
 <_comment>Hangul Word Processor File v5
-
+
   
 
   




svn commit: r1700984 - in /tika/trunk/tika-parsers/src/test/resources/test-documents: testHWP_3.0.hwp testHWP_5.0.hwp

2015-09-03 Thread nick
Author: nick
Date: Thu Sep  3 10:56:48 2015
New Revision: 1700984

URL: http://svn.apache.org/r1700984
Log:
Test HWP files from Mungeol Heo from TIKA-1728

Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_3.0.hwp   
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_5.0.hwp   
(with props)

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_3.0.hwp
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_3.0.hwp?rev=1700984&view=auto
==
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_3.0.hwp
--
svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_5.0.hwp
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_5.0.hwp?rev=1700984&view=auto
==
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testHWP_5.0.hwp
--
svn:mime-type = application/octet-stream




svn commit: r1696817 - /tika/trunk/tika-bundle/pom.xml

2015-08-20 Thread nick
Author: nick
Date: Thu Aug 20 17:08:26 2015
New Revision: 1696817

URL: http://svn.apache.org/r1696817
Log:
TIKA-1711 As Tika needs 1.7, remove 1.6 specific bits of the bundle build. 
Patch from Yaniv Kunda

Modified:
tika/trunk/tika-bundle/pom.xml

Modified: tika/trunk/tika-bundle/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-bundle/pom.xml?rev=1696817&r1=1696816&r2=1696817&view=diff
==
--- tika/trunk/tika-bundle/pom.xml (original)
+++ tika/trunk/tika-bundle/pom.xml Thu Aug 20 17:08:26 2015
@@ -387,56 +387,45 @@
   <skip>true</skip>
 </configuration>
   </plugin>
-</plugins>
-  </build>
 
-  <profiles>
-<profile>
-  <id>java6</id>
-  <activation>
-<jdk>[1.6,)</jdk>
-  </activation>
-  <build>
-<plugins>
-  <plugin>
-<artifactId>maven-assembly-plugin</artifactId>
-<executions>
-  <execution>
-<phase>pre-integration-test</phase>
-<goals>
-  <goal>single</goal>
-</goals>
-<configuration>
-  <descriptor>test-bundles.xml</descriptor>
-  <finalName>test</finalName>
-  <attach>false</attach>
-</configuration>
-  </execution>
-</executions>
-  </plugin>
-  <plugin>
-<artifactId>maven-failsafe-plugin</artifactId>
-<version>2.10</version>
-<executions>
-  <execution>
-<goals>
-  <goal>integration-test</goal>
-  <goal>verify</goal>
-</goals>
-  </execution>
-</executions>
+  <plugin>
+<artifactId>maven-assembly-plugin</artifactId>
+<executions>
+  <execution>
+<phase>pre-integration-test</phase>
+<goals>
+  <goal>single</goal>
+</goals>
 <configuration>
-  <systemPropertyVariables>
-<org.ops4j.pax.logging.DefaultServiceLog.level>
-  WARN
-</org.ops4j.pax.logging.DefaultServiceLog.level>
-  </systemPropertyVariables>
+  <descriptor>test-bundles.xml</descriptor>
+  <finalName>test</finalName>
+  <attach>false</attach>
 </configuration>
-  </plugin>
-</plugins>
-  </build>
-</profile>
-  </profiles>
+  </execution>
+</executions>
+  </plugin>
+
+  <plugin>
+<artifactId>maven-failsafe-plugin</artifactId>
+<version>2.10</version>
+<executions>
+  <execution>
+<goals>
+  <goal>integration-test</goal>
+  <goal>verify</goal>
+</goals>
+  </execution>
+</executions>
+<configuration>
+  <systemPropertyVariables>
+<org.ops4j.pax.logging.DefaultServiceLog.level>
+  WARN
+</org.ops4j.pax.logging.DefaultServiceLog.level>
+  </systemPropertyVariables>
+</configuration>
+  </plugin>
+</plugins>
+  </build>
 
   <organization>
 <name>The Apache Software Founation</name>




svn commit: r1696746 - in /tika/trunk/tika-parsers/src/test/java/org/apache/tika: embedder/ mime/ parser/ parser/chm/ parser/code/ parser/geo/topic/ parser/html/ parser/image/ parser/jdbc/ parser/mail

2015-08-20 Thread nick
Author: nick
Date: Thu Aug 20 09:59:17 2015
New Revision: 1696746

URL: http://svn.apache.org/r1696746
Log:
TIKA-1710 patch from Yaniv Kunda - Use Commons IO instead of the Tika Core IO 
copies, and java.nio.charset.StandardCharsets

Modified:

tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestParameters.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/WebPParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java?rev=1696746&r1=1696745&r2=1696746&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
 Thu Aug 20 09:59:17 2015
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.embedder;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
@@ -38,7 +39,6 @@ import java.util.Locale;
 import java.util.Map;
 
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -59,7 +59,7 @@ public class ExternalEmbedderTest {
 
 protected static final DateFormat EXPECTED_METADATA_DATE_FORMATTER =
 new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ROOT);
-protected static final String DEFAULT_CHARSET = IOUtils.UTF_8.name();
+protected static final String DEFAULT_CHARSET = UTF_8.name();
 private static final String COMMAND_METADATA_ARGUMENT_DESCRIPTION = 
"dc:description";
 private static final String TEST_TXT_PATH = "/test-documents/testTXT.txt";
 

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1696746&r1=1696745&r2=1696746&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
(original)
+++ 
tika/trunk/tika

svn commit: r1696749 - in /tika/trunk/tika-batch: ./ src/main/java/org/apache/tika/batch/ src/main/java/org/apache/tika/batch/fs/ src/main/java/org/apache/tika/batch/fs/strawman/ src/test/java/org/apa

2015-08-20 Thread nick
Author: nick
Date: Thu Aug 20 10:02:19 2015
New Revision: 1696749

URL: http://svn.apache.org/r1696749
Log:
TIKA-1710 patch from Yaniv Kunda - Use Commons IO instead of the Tika Core IO 
copies, and java.nio.charset.StandardCharsets

Modified:
tika/trunk/tika-batch/pom.xml
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java

tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/Interrupter.java

tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java

tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java

tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java

tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/strawman/StrawManTikaAppDriver.java

tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/CommandLineParserBuilderTest.java

tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java

tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchDriverTest.java

tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java

tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java

tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java

tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/StringStreamGobbler.java

Modified: tika/trunk/tika-batch/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/pom.xml?rev=1696749&r1=1696748&r2=1696749&view=diff
==
--- tika/trunk/tika-batch/pom.xml (original)
+++ tika/trunk/tika-batch/pom.xml Thu Aug 20 10:02:19 2015
@@ -67,6 +67,11 @@
   <version>${cli.version}</version>
 </dependency>
 <dependency>
+  <groupId>commons-io</groupId>
+  <artifactId>commons-io</artifactId>
+  <version>${commons.io.version}</version>
+</dependency>
+<dependency>
   <groupId>org.apache.tika</groupId>
   <artifactId>tika-core</artifactId>
   <version>${project.version}</version>
@@ -85,12 +90,6 @@
   <artifactId>junit</artifactId>
   <scope>test</scope>
 </dependency>
-<dependency>
-  <groupId>commons-io</groupId>
-  <artifactId>commons-io</artifactId>
-  <scope>test</scope>
-  <version>${commons.io.version}</version>
-</dependency>
   </dependencies>
 
   <build>

Modified: 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java?rev=1696749&r1=1696748&r2=1696749&view=diff
==
--- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java 
(original)
+++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java 
Thu Aug 20 10:02:19 2015
@@ -31,10 +31,10 @@ import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 
-import org.apache.tika.io.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 
 /**
  * This is the main processor class for a single process.
@@ -134,7 +134,7 @@ public class BatchProcess implements Cal
 //System.err should be redirected to System.out
 PrintStream sysErr = System.err;
 try {
-outputStreamWriter = new PrintStream(sysErr, true, 
IOUtils.UTF_8.toString());
+outputStreamWriter = new PrintStream(sysErr, true, 
UTF_8.toString());
 } catch (IOException e) {
 throw new RuntimeException("Can't redirect streams");
 }

Modified: 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java?rev=1696749&r1=1696748&r2=1696749&view=diff
==
--- 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
 (original)
+++ 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
 Thu Aug 20 10:02:19 2015
@@ -29,10 +29,12 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 
-import org.apache.tika.io.IOUtils;
+import org.apache.commons.io.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 public class BatchProcessDriverCLI {
 
 /**
@@ -285,7 +287,7 @@ public class BatchProcessDriverCLI {
 private BufferedReader reader;
 
 private InterruptWatcher(InputStream is) {
-reader = new BufferedReader(new InputStreamReader

svn commit: r1696745 [1/2] - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser: audio/ chm/accessor/ chm/core/ code/ crypto/ ctakes/ dif/ envi/ epub/ feed/ gdal/ geo/topic/ hdf/ html/ i

2015-08-20 Thread nick
Author: nick
Date: Thu Aug 20 09:51:44 2015
New Revision: 1696745

URL: http://svn.apache.org/r1696745
Log:
TIKA-1710 patch from Yaniv Kunda - Use Commons IO instead of the Tika Core IO 
copies, and java.nio.charset.StandardCharsets

Modified:

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/audio/MidiParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dif/DIFParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/feed/FeedParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/gdal/GDALParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/hdf/HDFParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java

svn commit: r1696745 [2/2] - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser: audio/ chm/accessor/ chm/core/ code/ crypto/ ctakes/ dif/ envi/ epub/ feed/ gdal/ geo/topic/ hdf/ html/ i

2015-08-20 Thread nick
Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1696745&r1=1696744&r2=1696745&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
 Thu Aug 20 09:51:44 2015
@@ -27,6 +27,7 @@ import java.io.UnsupportedEncodingExcept
 import java.util.Locale;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import org.apache.commons.io.FilenameUtils;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
@@ -35,7 +36,6 @@ import org.apache.poi.poifs.filesystem.N
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.util.IOUtils;
-import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.RTFMetadata;

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1696745&r1=1696744&r2=1696745&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java 
Thu Aug 20 09:51:44 2015
@@ -21,8 +21,8 @@ import java.io.InputStream;
 import java.util.Collections;
 import java.util.Set;
 
+import org.apache.commons.io.input.TaggedInputStream;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TaggedInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java?rev=1696745&r1=1696744&r2=1696745&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java
 Thu Aug 20 09:51:44 2015
@@ -30,7 +30,6 @@ import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -41,6 +40,8 @@ import org.apache.tika.sax.XHTMLContentH
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 /**
  * Parser that uses the strings (or strings-alternative) command to find the
  * printable strings in a object, or other binary, file
@@ -267,7 +268,7 @@ public class StringsParser extends Abstr
int totalBytes = 0;
 
try {
-   reader = new BufferedReader(new 
InputStreamReader(stream, IOUtils.UTF_8));
+   reader = new BufferedReader(new 
InputStreamReader(stream, UTF_8));
 
int n = 0;
while ((n = reader.read(buffer)) != -1) {
@@ -320,7 +321,7 @@ public class StringsParser extends Abstr
String fileOutput = null;
 
try {
-   reader = new BufferedReader(new InputStreamReader(out, 
IOUtils.UTF_8));
+   reader = new BufferedReader(new InputStreamReader(out, 
UTF_8));
fileOutput = reader.readLine();
 
} catch (IOException ioe) {

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=1696745&r1=1696744&r2=1696745&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java 
Thu Aug 20 09:51:44 2015
@@ -22,10 +22,10 @@ import java.nio.charset.Charset;
 import java.util.Collections;
 import java.util.Set;
 
+import org.apache.commons.io.input.CloseShieldInputStream;
 import 

svn commit: r1696836 - in /tika/trunk: ./ tika-parent/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/ tika-parsers/src/test/java/org/apache/tika/parser/pkg/

2015-08-20 Thread nick
Author: nick
Date: Thu Aug 20 18:31:15 2015
New Revision: 1696836

URL: http://svn.apache.org/r1696836
Log:
TIKA-1718 Upgrade to Commons Compress 1.10, and fix various TODOs that this 
permits

Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parent/pom.xml

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1696836&r1=1696835&r2=1696836&view=diff
==
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Aug 20 18:31:15 2015
@@ -12,6 +12,9 @@ Release 1.11 - Current Development
   * Corrected Tika Config XML detector defintion explicit loading 
 of MimeTypes (TIKA-1708)
 
+  * Upgraded to Commons Compress 1.10, which enables zlib compressed
+archives support (TIKA-1718)
+
 
 Release 1.10 - 8/1/2015
 

Modified: tika/trunk/tika-parent/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parent/pom.xml?rev=1696836&r1=1696835&r2=1696836&view=diff
==
--- tika/trunk/tika-parent/pom.xml (original)
+++ tika/trunk/tika-parent/pom.xml Thu Aug 20 18:31:15 2015
@@ -301,7 +301,7 @@
 <maven.compiler.source>1.7</maven.compiler.source>
 <maven.compiler.target>1.7</maven.compiler.target>
 
<project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
-<commons.compress.version>1.9</commons.compress.version>
+<commons.compress.version>1.10</commons.compress.version>
 <commons.io.version>2.4</commons.io.version>
 <slf4j.version>1.7.12</slf4j.version>
   </properties>

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java?rev=1696836&r1=1696835&r2=1696836&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
 Thu Aug 20 18:31:15 2015
@@ -58,11 +58,10 @@ public class CompressorParser extends Ab
 private static final MediaType GZIP_ALT = MediaType.application("x-gzip");
 private static final MediaType XZ = MediaType.application("x-xz");
 private static final MediaType PACK = 
MediaType.application("application/x-java-pack200");
-// TODO Not yet supported by CompressorStreamFactory, see COMPRESS-316
 private static final MediaType ZLIB = MediaType.application("zlib");
 
 private static final Set<MediaType> SUPPORTED_TYPES =
-MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, XZ, PACK);
+MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, XZ, PACK, ZLIB);
 
 static MediaType getMediaType(CompressorInputStream stream) {
 // TODO Add support for the remaining CompressorInputStream formats:
@@ -103,14 +102,14 @@ public class CompressorParser extends Ab
 
 CompressorInputStream cis;
 try {
-CompressorStreamFactory factory = new CompressorStreamFactory();
 CompressorParserOptions options =
  context.get(CompressorParserOptions.class, new 
CompressorParserOptions() {
  public boolean decompressConcatenated(Metadata metadata) {
  return false;
  }
  });
-
factory.setDecompressConcatenated(options.decompressConcatenated(metadata));
+CompressorStreamFactory factory = 
+new 
CompressorStreamFactory(options.decompressConcatenated(metadata));
 cis = factory.createCompressorInputStream(stream);
 } catch (CompressorException e) {
 throw new TikaException("Unable to uncompress document stream", e);

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1696836&r1=1696835&r2=1696836&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
 Thu Aug 20 18:31:15 2015
@@ -24,6 +24,7 @@ import java.io.InputStream;
 import java.util.Date;
 import java.util.Set;
 
+import org.apache.commons.compress.PasswordRequiredException;
 import org.apache.commons.compress.archivers.ArchiveEntry;
 import

svn commit: r1696833 - in /tika/trunk: tika-batch/pom.xml tika-parent/pom.xml tika-parsers/pom.xml

2015-08-20 Thread nick
Author: nick
Date: Thu Aug 20 18:08:44 2015
New Revision: 1696833

URL: http://svn.apache.org/r1696833
Log:
TIKA-1718 Enforce a consistent commons compress version between components

Modified:
tika/trunk/tika-batch/pom.xml
tika/trunk/tika-parent/pom.xml
tika/trunk/tika-parsers/pom.xml

Modified: tika/trunk/tika-batch/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/pom.xml?rev=1696833&r1=1696832&r2=1696833&view=diff
==
--- tika/trunk/tika-batch/pom.xml (original)
+++ tika/trunk/tika-batch/pom.xml Thu Aug 20 18:08:44 2015
@@ -36,9 +36,6 @@
 
   <properties>
 <cli.version>1.2</cli.version>
-<!-- sync version with tika-server or move to parent? -->
-<compress.version>1.9</compress.version>
-<!-- sync with tika-parsers or move to parent? -->
   </properties>
 
   <dependencies>
@@ -55,7 +52,7 @@
 <dependency>
   <groupId>org.apache.commons</groupId>
   <artifactId>commons-compress</artifactId>
-  <version>${compress.version}</version>
+  <version>${commons.compress.version}</version>
 </dependency>
 <dependency>
   <groupId>org.slf4j</groupId>

Modified: tika/trunk/tika-parent/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parent/pom.xml?rev=1696833&r1=1696832&r2=1696833&view=diff
==
--- tika/trunk/tika-parent/pom.xml (original)
+++ tika/trunk/tika-parent/pom.xml Thu Aug 20 18:08:44 2015
@@ -301,6 +301,7 @@
 <maven.compiler.source>1.7</maven.compiler.source>
 <maven.compiler.target>1.7</maven.compiler.target>
 
<project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
+<commons.compress.version>1.9</commons.compress.version>
 <commons.io.version>2.4</commons.io.version>
 <slf4j.version>1.7.12</slf4j.version>
   </properties>

Modified: tika/trunk/tika-parsers/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1696833&r1=1696832&r2=1696833&view=diff
==
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Thu Aug 20 18:08:44 2015
@@ -36,11 +36,10 @@
 
   <properties>
 <poi.version>3.13-beta1</poi.version>
+<!-- NOTE: sync codec version with POI -->
 <codec.version>1.9</codec.version>
-<!-- NOTE: sync with POI -->
-<compress.version>1.9</compress.version>
+<!-- NOTE: sync tukaani version with commons-compress -->
 <tukaani.version>1.5</tukaani.version>
-<!-- NOTE: sync with commons-compress -->
 <mime4j.version>0.7.2</mime4j.version>
 <vorbis.version>0.6</vorbis.version>
 <pdfbox.version>1.8.10</pdfbox.version>
@@ -121,7 +120,7 @@
 <dependency>
   <groupId>org.apache.commons</groupId>
   <artifactId>commons-compress</artifactId>
-  <version>${compress.version}</version>
+  <version>${commons.compress.version}</version>
 </dependency>
 <dependency>
   <groupId>org.tukaani</groupId>




svn commit: r1696856 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java

2015-08-20 Thread nick
Author: nick
Date: Thu Aug 20 21:11:19 2015
New Revision: 1696856

URL: http://svn.apache.org/r1696856
Log:
One more format to add support for

Modified:

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java?rev=1696856&r1=1696855&r2=1696856&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
 Thu Aug 20 21:11:19 2015
@@ -71,6 +71,7 @@ public class CompressorParser extends Ab
 static MediaType getMediaType(CompressorInputStream stream) {
 // TODO Add support for the remaining CompressorInputStream formats:
 //   LZMACompressorInputStream
+//   LZWInputStream - UnshrinkingInputStream
 if (stream instanceof BZip2CompressorInputStream) {
 return BZIP2;
 } else if (stream instanceof GzipCompressorInputStream) {




svn commit: r1696862 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java

2015-08-20 Thread nick
Author: nick
Date: Thu Aug 20 21:46:28 2015
New Revision: 1696862

URL: http://svn.apache.org/r1696862
Log:
Bring in line with other parsers with special InputStream requirements, by 
using TikaInputStream TIKA-1710

Modified:

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1696862&r1=1696861&r2=1696862&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
 Thu Aug 20 21:46:28 2015
@@ -21,11 +21,11 @@ import java.io.InputStream;
 import java.util.Collections;
 import java.util.Set;
 
-import org.apache.commons.io.input.TaggedInputStream;
 import org.apache.james.mime4j.MimeException;
 import org.apache.james.mime4j.parser.MimeStreamParser;
 import org.apache.james.mime4j.stream.MimeConfig;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
@@ -44,7 +44,6 @@ import org.xml.sax.SAXException;
  * @author jnio...@digitalpebble.com
  */
 public class RFC822Parser extends AbstractParser {
-
 /**
  * Serial version UID
  */
@@ -73,13 +72,12 @@ public class RFC822Parser extends Abstra
 xhtml, metadata, context, config.isStrictParsing());
 parser.setContentHandler(mch);
 parser.setContentDecoding(true);
-TaggedInputStream tagged = stream instanceof TaggedInputStream
-? (TaggedInputStream)stream
-: new TaggedInputStream(stream);
+
+TikaInputStream tstream = TikaInputStream.get(stream);
 try {
-parser.parse(tagged);
+parser.parse(tstream);
 } catch (IOException e) {
-tagged.throwIfCauseOf(e);
+tstream.throwIfCauseOf(e);
 throw new TikaException("Failed to parse an email message", e);
 } catch (MimeException e) {
 // Unwrap the exception in case it was not thrown by mime4j




svn commit: r1696859 - /tika/trunk/tika-parsers/pom.xml

2015-08-20 Thread nick
Author: nick
Date: Thu Aug 20 21:38:45 2015
New Revision: 1696859

URL: http://svn.apache.org/r1696859
Log:
TIKA-1710 Guava is no longer required, we have StandardCharsets instead now

Modified:
tika/trunk/tika-parsers/pom.xml

Modified: tika/trunk/tika-parsers/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1696859&r1=1696858&r2=1696859&view=diff
==
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Thu Aug 20 21:38:45 2015
@@ -327,11 +327,6 @@
   artifactIdhttpservices/artifactId
   version${netcdf-java.version}/version
 /dependency
-dependency
-  groupIdcom.google.guava/groupId
-  artifactIdguava/artifactId
-  version11.0.2/version
-/dependency
 !-- Apache Commons CSV --
 dependency
   groupIdorg.apache.commons/groupId




svn commit: r1696860 - /tika/trunk/CHANGES.txt

2015-08-20 Thread nick
Author: nick
Date: Thu Aug 20 21:40:12 2015
New Revision: 1696860

URL: http://svn.apache.org/r1696860
Log:
Changelog update

Modified:
tika/trunk/CHANGES.txt

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1696860&r1=1696859&r2=1696860&view=diff
==
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Aug 20 21:40:12 2015
@@ -12,6 +12,10 @@ Release 1.11 - Current Development
   * Corrected Tika Config XML detector defintion explicit loading 
 of MimeTypes (TIKA-1708)
 
+  * In Tika Parsers, Batch, Server, App and Examples, use Apache
+Commons IO instead of inlined ex-Commons classes, and the Java 7
+Standard Charset definitions (TIKA-1710)
+
   * Upgraded to Commons Compress 1.10, which enables zlib compressed
 archives support (TIKA-1718)
 




svn commit: r1696609 - in /tika/site/src/site/apt: 1.10/configuring.apt 1.11/configuring.apt

2015-08-19 Thread nick
Author: nick
Date: Wed Aug 19 15:04:22 2015
New Revision: 1696609

URL: http://svn.apache.org/r1696609
Log:
Fix APT markup for service loader config documentation

Modified:
tika/site/src/site/apt/1.10/configuring.apt
tika/site/src/site/apt/1.11/configuring.apt

Modified: tika/site/src/site/apt/1.10/configuring.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/1.10/configuring.apt?rev=1696609&r1=1696608&r2=1696609&view=diff
==
--- tika/site/src/site/apt/1.10/configuring.apt (original)
+++ tika/site/src/site/apt/1.10/configuring.apt Wed Aug 19 15:04:22 2015
@@ -139,6 +139,9 @@ Configuring Tika
 While the work on that is ongoing, for now you will need to review the
 {{{./api/}Tika Javadocs}} to see how individual Translators are configured.
 
+~~ When Translators can have their parameters configured, mention here about
+~~ specifying which single one to use in the Tika Config XML
+
 * {Configuring the Service Loader}
 
 Tika has a number of service provider types such as parsers, detectors, 
and translators.  
@@ -149,12 +152,14 @@ Configuring Tika
 
 The ServiceLoader's registry can be populated either statically or 
dynamically.
 
-Static
+** Static
+
 Static loading is the default which requires no configuration.  This 
configuration options is used in
 Tika deployments where the Tika JAR files reside together in the same 
classloader hierarchy.  The services 
 provides are loaded from provider configuration files located within the 
tika-parsers JAR file at META-INF/services.
 
-Dynamic
+** Dynamic
+
 Dynamic loading may be required if the tika service providers will reside 
in different classloaders such as 
 in OSGi.  To allow a provider created in tika-config.xml to utilize 
dynamically loaded services you need to 
 configure the ServiceLoader to be dynamic with the following configuration:
@@ -166,15 +171,22 @@ Configuring Tika
 /properties
 ---
 
+** Load Error Handling
+
 The ServiceLoader can contains a handler to deal with errors that occur 
during provider initialization.  For example
 if a class fails to initialize LoadErrorHandler deals with the exception 
that is thrown.
 This handler can be configured to:
 
-IGNORE - (Default) Do nothing when providers fail to initialize.
-WARN   - Log a warning when providers fail to initialize.
-THROW  - Throw an exception when providers fail to initialize.
+*  IGNORE  - (Default) Do nothing when providers fail to initialize.
+
+*  WARN- Log a warning when providers fail to initialize.
+
+*  THROW   - Throw an exception when providers fail to initialize.
+
+[]
+
+For example to set the LoadErrorHandler to WARN then use the following 
configuration:
 
-For example to set the LoadErrorHandler to WARN then use the following 
configuration:
 ---
 properties
   service-loader loadErrorHandler=WARN/
@@ -182,9 +194,6 @@ For example to set the LoadErrorHandler
 /properties
 ---
 
-~~ When Translators can have their parameters configured, mention here about
-~~ specifying which single one to use in the Tika Config XML
-
 * {Using a Tika Configuration XML file}
 
 However you call Tika, the System Property of  tika.config  is

Modified: tika/site/src/site/apt/1.11/configuring.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/1.11/configuring.apt?rev=1696609&r1=1696608&r2=1696609&view=diff
==
--- tika/site/src/site/apt/1.11/configuring.apt (original)
+++ tika/site/src/site/apt/1.11/configuring.apt Wed Aug 19 15:04:22 2015
@@ -139,6 +139,9 @@ Configuring Tika
 While the work on that is ongoing, for now you will need to review the
 {{{./api/}Tika Javadocs}} to see how individual Translators are configured.
 
+~~ When Translators can have their parameters configured, mention here about
+~~ specifying which single one to use in the Tika Config XML
+
 * {Configuring the Service Loader}
 
 Tika has a number of service provider types such as parsers, detectors, 
and translators.  
@@ -149,12 +152,14 @@ Configuring Tika
 
 The ServiceLoader's registry can be populated either statically or 
dynamically.
 
-Static
+** Static
+
 Static loading is the default which requires no configuration.  This 
configuration options is used in
 Tika deployments where the Tika JAR files reside together in the same 
classloader hierarchy.  The services 
 provides are loaded from provider configuration files located within the 
tika-parsers JAR file at META-INF/services.
 
-Dynamic
+** Dynamic
+
 Dynamic loading may be required if the tika service providers will reside 
in different classloaders such as 
 in OSGi.  To allow a provider created in tika-config.xml to utilize 
dynamically loaded services you need to 
 configure

svn commit: r1696610 - in /tika/site/publish: 1.10/configuring.html 1.11/configuring.html

2015-08-19 Thread nick
Author: nick
Date: Wed Aug 19 15:06:01 2015
New Revision: 1696610

URL: http://svn.apache.org/r1696610
Log:
Republish the site

Modified:
tika/site/publish/1.10/configuring.html
tika/site/publish/1.11/configuring.html

Modified: tika/site/publish/1.10/configuring.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.10/configuring.html?rev=1696610&r1=1696609&r2=1696610&view=diff
==
--- tika/site/publish/1.10/configuring.html (original)
+++ tika/site/publish/1.10/configuring.html Wed Aug 19 15:06:01 2015
@@ -96,7 +96,12 @@
 lia href=#Configuring_Mime_TypesConfiguring Mime Types/a/li
 lia href=#Configuring_Language_IdentifiersConfiguring Language 
Identifiers/a/li
 lia href=#Configuring_TranslatorsConfiguring Translators/a/li
-lia href=#Configuring_the_Service_LoaderConfiguring the Service 
Loader/a/li/ul/li/ul
+lia href=#Configuring_the_Service_LoaderConfiguring the Service 
Loader/a
+ul
+lia href=#StaticStatic/a/li
+lia href=#DynamicDynamic/a/li
+lia href=#Load_Error_HandlingLoad Error Handling/a/li/ul/li
+lia href=#Using_a_Tika_Configuration_XML_fileUsing a Tika Configuration 
XML file/a/li/ul/li/ul
 div class=section
 h3a name=Configuring_ParsersConfiguring Parsers/a/h3
 pThrough the Tika Config xml, it is possible to have a high degree of 
control over which parsers are or aren't used, in what order of preferences 
etc. It is also possible to override just certain parts, to (for example) have 
quot;default except for PDFquot;./p
@@ -156,22 +161,35 @@
 pAt this time, there is no unified way to configure language identifiers. 
While the work on that is ongoing, for now you will need to review the a 
href=./api/Tika Javadocs/a to see how individual identifiers are 
configured./p/div
 div class=section
 h3a name=Configuring_TranslatorsConfiguring Translators/a/h3
-pAt this time, there is no unified way to configure Translators. While the 
work on that is ongoing, for now you will need to review the a 
href=./api/Tika Javadocs/a to see how individual Translators are 
configured./p/div
+pAt this time, there is no unified way to configure Translators. While the 
work on that is ongoing, for now you will need to review the a 
href=./api/Tika Javadocs/a to see how individual Translators are 
configured./p!-- When Translators can have their parameters configured, 
mention here about --!-- specifying which single one to use in the Tika 
Config XML --/div
 div class=section
 h3a name=Configuring_the_Service_LoaderConfiguring the Service 
Loader/a/h3
 pTika has a number of service provider types such as parsers, detectors, and 
translators. The a 
href=./api/org/apache/tika/config/ServiceLoader.htmlorg.apache.tika.config.ServiceLoader/a
 class provides a registry of each type of provider. This allows Tika to create 
implementations such as a 
href=./api/org/apache/tika/parser/DefaultParser.htmlorg.apache.tika.parser.DefaultParser/a,
 a 
href=./api/org/apache/tika/language/translate/DefaultTranslator.htmlorg.apache.tika.language.translate.DefaultTranslator/a,
 and a 
href=./api/org/apache/tika/detect/DefaultDetector.htmlorg.apache.tika.detect.DefaultDetector/a
 that can match the appropriate provider to an incoming piece of content./p
 pThe ServiceLoader's registry can be populated either statically or 
dynamically./p
-pStatic Static loading is the default which requires no configuration. This 
configuration options is used in Tika deployments where the Tika JAR files 
reside together in the same classloader hierarchy. The services provides are 
loaded from provider configuration files located within the tika-parsers JAR 
file at META-INF/services./p
-pDynamic Dynamic loading may be required if the tika service providers will 
reside in different classloaders such as in OSGi. To allow a provider created 
in tika-config.xml to utilize dynamically loaded services you need to configure 
the ServiceLoader to be dynamic with the following configuration:/p
+div class=section
+h4Statica name=Static/a/h4
+pStatic loading is the default which requires no configuration. This 
configuration options is used in Tika deployments where the Tika JAR files 
reside together in the same classloader hierarchy. The services provides are 
loaded from provider configuration files located within the tika-parsers JAR 
file at META-INF/services./p/div
+div class=section
+h4Dynamica name=Dynamic/a/h4
+pDynamic loading may be required if the tika service providers will reside 
in different classloaders such as in OSGi. To allow a provider created in 
tika-config.xml to utilize dynamically loaded services you need to configure 
the ServiceLoader to be dynamic with the following configuration:/p
 div
 prelt;propertiesgt;
   lt;service-loader dynamic=quot;truequot;/gt;
   
-lt;/propertiesgt;/pre/div
-pThe ServiceLoader can contains a handler to deal with errors that occur 
during provider initialization. For example if a class fails to initialize

svn commit: r1696605 - in /tika/site/src/site/apt/1.11: ./ configuring.apt examples.apt formats.apt

2015-08-19 Thread nick
Author: nick
Date: Wed Aug 19 14:53:30 2015
New Revision: 1696605

URL: http://svn.apache.org/r1696605
Log:
Start on the 1.11 docs, for the pieces that need updating during development

Added:
tika/site/src/site/apt/1.11/
tika/site/src/site/apt/1.11/configuring.apt
tika/site/src/site/apt/1.11/examples.apt
  - copied unchanged from r1696597, tika/site/src/site/apt/1.10/examples.apt
tika/site/src/site/apt/1.11/formats.apt
  - copied, changed from r1696597, tika/site/src/site/apt/1.10/formats.apt

Added: tika/site/src/site/apt/1.11/configuring.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/1.11/configuring.apt?rev=1696605&view=auto
==
--- tika/site/src/site/apt/1.11/configuring.apt (added)
+++ tika/site/src/site/apt/1.11/configuring.apt Wed Aug 19 14:53:30 2015
@@ -0,0 +1,214 @@
+  
+  Configuring Tika
+  
+
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements.  See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the License); you may not use this file except in compliance with
+~~ the License.  You may obtain a copy of the License at
+~~
+~~ http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an AS IS BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+Configuring Tika
+
+   Out of the box, Apache Tika will attempt to start with all available
+   Detectors and Parsers, running with sensible defaults. For most users,
+   this default configuration will work well.
+
+   This page gives you information on how to configure the various
+   components of Apache Tika, such as Parsers and Detectors, if you need
+   fine-grained control over ordering, exclusions and the like.
+
+%{toc|section=1|fromDepth=1}
+
+* {Configuring Parsers}
+
+Through the Tika Config xml, it is possible to have a high degree of 
control
+over which parsers are or aren't used, in what order of preferences etc. 
It 
+is also possible to override just certain parts, to (for example) have 
default
+except for PDF.
+
+Currently, it is only possible to have a single parser run against a 
document.
+There is on-going discussion around fallback parsers and combining the 
output
+of multiple parsers running on a document, but none of these are available 
yet.
+
+To override some parser certain default behaviours, include the {{{ 
DefaultParser }}}
+in your configuration, with excludes, then add other parser definitions in.
+To prevent the {{{ DefaultParser }}} (with its auto-discovery) being used, 
+simply omit it from your config, and list all other parsers you want 
instead.
+
+To override just some default behaviour, you can use a Tika Config 
something
+like this:
+
+---
+?xml version=1.0 encoding=UTF-8?
+properties
+  parsers
+!-- Default Parser for most things, except for 2 mime types, and never
+ use the Executable Parser --
+parser class=org.apache.tika.parser.DefaultParser
+  mime-excludeimage/jpeg/mime-exclude
+  mime-excludeapplication/pdf/mime-exclude
+  parser-exclude 
class=org.apache.tika.parser.executable.ExecutableParser/
+/parser
+!-- Use a different parser for PDF --
+parser class=org.apache.tika.parser.EmptyParser
+  mimeapplication/pdf/mime
+/parser
+  /parsers
+/properties
+---
+
+To configure things in code, the key classes to use to build up your own 
custom 
+parser heirarchy are 
+
{{{./api/org/apache/tika/parser/DefaultParser.html}org.apache.tika.parser.DefaultParser}},
+
{{{./api/org/apache/tika/parser/CompositeParser.html}org.apache.tika.parser.CompositeParser}}
+and
+
{{{./api/org/apache/tika/parser/ParserDecorator.html}org.apache.tika.parser.ParserDecorator}}.
+
+* {Configuring Detectors}
+
+Through the Tika Config xml, it is possible to have a high degree of 
control
+over which detectors are or aren't used, in what order of preferences etc. 
It 
+is also possible to override just certain parts, to (for example) have 
default
+except for no POIFS Container Detction.
+
+To override some detector certain default behaviours, include the 
+{{{ DefaultDetector }}}, with any {{{ detector-exclude }}} entries you 
need,
+in your configuration, then add other detectors definitions in. To prevent 
+the {{{ DefaultParser }}} (with its auto-discovery) being used, simply 
omit it 
+from your config, and list all

svn commit: r1696322 - /tika/trunk/CHANGES.txt

2015-08-17 Thread nick
Author: nick
Date: Mon Aug 17 18:10:09 2015
New Revision: 1696322

URL: http://svn.apache.org/r1696322
Log:
Changelog update

Modified:
tika/trunk/CHANGES.txt

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1696322&r1=1696321&r2=1696322&view=diff
==
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Aug 17 18:10:09 2015
@@ -9,6 +9,9 @@ Release 1.11 - Current Development
 
   * Upgraded to ASM 5.0.4 (TIKA-1705).
 
+  * Corrected Tika Config XML detector defintion explicit loading 
+of MimeTypes (TIKA-1708)
+
 
 Release 1.10 - 8/1/2015
 




svn commit: r1696159 - in /tika/trunk: tika-core/src/test/java/org/apache/tika/config/ tika-example/ tika-parsers/src/main/java/org/apache/tika/parser/mbox/ tika-parsers/src/test/java/org/apache/tika/

2015-08-16 Thread nick
Author: nick
Date: Sun Aug 16 18:00:57 2015
New Revision: 1696159

URL: http://svn.apache.org/r1696159
Log:
Outlook detection with custom config tests, based on work by Justin Palmer 
TIKA-1708

Added:

tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.xml

tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-default.xml
Modified:

tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
tika/trunk/tika-example/pom.xml

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java

Modified: 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java?rev=1696159&r1=1696158&r2=1696159&view=diff
==
--- 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
 (original)
+++ 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
 Sun Aug 16 18:00:57 2015
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertNot
 
 import java.net.URL;
 
+import org.apache.tika.TikaTest;
 import org.apache.tika.parser.ParseContext;
 import org.junit.After;
 
@@ -29,7 +30,7 @@ import org.junit.After;
  *  that {@link TikaConfigTest} can't, do due to a need for the
  *  full set of real classes of parsers / detectors
  */
-public abstract class AbstractTikaConfigTest {
+public abstract class AbstractTikaConfigTest extends TikaTest {
 protected static ParseContext context = new ParseContext();
 
 protected static String getConfigPath(String config) throws Exception {

Modified: tika/trunk/tika-example/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/pom.xml?rev=1696159&r1=1696158&r2=1696159&view=diff
==
--- tika/trunk/tika-example/pom.xml (original)
+++ tika/trunk/tika-example/pom.xml Sun Aug 16 18:00:57 2015
@@ -64,6 +64,13 @@
 /dependency
 dependency
   groupIdorg.apache.tika/groupId
+  artifactIdtika-core/artifactId
+  version${project.version}/version
+  typetest-jar/type
+  scopetest/scope
+/dependency
+dependency
+  groupIdorg.apache.tika/groupId
   artifactIdtika-parsers/artifactId
   version${project.version}/version
   typetest-jar/type

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1696159&r1=1696158&r2=1696159&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
 Sun Aug 16 18:00:57 2015
@@ -46,14 +46,13 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
 /**
- * @author Tran Nam Quang
- * @author hong-thai.nguyen
+ * Parser for MS Outlook PST email storage files
  */
 public class OutlookPSTParser extends AbstractParser {
 
 private static final long serialVersionUID = 620998217748364063L;
 
-private static final MediaType MS_OUTLOOK_PST_MIMETYPE = 
MediaType.application(vnd.ms-outlook-pst);
+public static final MediaType MS_OUTLOOK_PST_MIMETYPE = 
MediaType.application(vnd.ms-outlook-pst);
 private static final SetMediaType SUPPORTED_TYPES = 
singleton(MS_OUTLOOK_PST_MIMETYPE);
 
 private static AttributesImpl createAttribute(String attName, String 
attValue) {

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java?rev=1696159&r1=1696158&r2=1696159&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
 Sun Aug 16 18:00:57 2015
@@ -25,8 +25,12 @@ import org.apache.tika.detect.CompositeD
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.detect.EmptyDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.mbox.OutlookPSTParser;
 import org.apache.tika.parser.microsoft.POIFSContainerDetector;
 import org.apache.tika.parser.pkg.ZipContainerDetector;
+import

svn commit: r1696158 - in /tika/trunk: tika-core/src/test/java/org/apache/tika/TikaTest.java tika-parsers/src/test/java/org/apache/tika/TikaTest.java

2015-08-16 Thread nick
Author: nick
Date: Sun Aug 16 17:58:55 2015
New Revision: 1696158

URL: http://svn.apache.org/r1696158
Log:
Move the parent test class of many Tika tests to core/test, so core tests can 
use it too

Added:
tika/trunk/tika-core/src/test/java/org/apache/tika/TikaTest.java
  - copied unchanged from r1696139, 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
Removed:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java



svn commit: r1696160 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java

2015-08-16 Thread nick
Author: nick
Date: Sun Aug 16 18:35:26 2015
New Revision: 1696160

URL: http://svn.apache.org/r1696160
Log:
TIKA-1708 If the Tika Config detector entry calls for MimeTypes, use the 
already created one, avoid creating a new empty one

Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1696160&r1=1696159&r2=1696160&view=diff
==
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Sun Aug 16 18:35:26 2015
@@ -434,6 +434,8 @@ public class TikaConfig {
 abstract Class? extends T getLoaderClass(); // Generics workaround
 abstract boolean isComposite(T loaded);
 abstract boolean isComposite(Class? extends T loadedClass);
+abstract T preLoadOne(Class? extends T loadedClass, String 
classname, 
+MimeTypes mimeTypes) throws TikaException;
 abstract CT createDefault(MimeTypes mimeTypes, ServiceLoader loader);
 abstract CT createComposite(ListT loaded, MimeTypes mimeTypes, 
ServiceLoader loader);
 abstract T createComposite(Class? extends T compositeClass, 
@@ -479,15 +481,11 @@ public class TikaConfig {
 try {
 Class? extends T loadedClass =
 loader.getServiceClass(getLoaderClass(), name);
-
-// Check for classes which can't be set in config
-if (AutoDetectParser.class.isAssignableFrom(loadedClass)) {
-// https://issues.apache.org/jira/browse/TIKA-866
-throw new TikaException(
-AutoDetectParser not supported in a parser
-+  configuration element:  + name);
-}
 
+// Do pre-load checks and short-circuits
+loaded = preLoadOne(loadedClass, name, mimeTypes);
+if (loaded != null) return loaded;
+
 // Is this a composite or decorated class? If so, support 
recursion
 if (isComposite(loadedClass)) {
 // Get the child objects for it
@@ -562,6 +560,19 @@ public class TikaConfig {
 return Parser.class;
 }
 @Override
+Parser preLoadOne(Class? extends Parser loadedClass, String 
classname, 
+  MimeTypes mimeTypes) throws TikaException {
+// Check for classes which can't be set in config
+if (AutoDetectParser.class.isAssignableFrom(loadedClass)) {
+// https://issues.apache.org/jira/browse/TIKA-866
+throw new TikaException(
+AutoDetectParser not supported in a parser
++  configuration element:  + classname);
+}
+// Continue with normal loading
+return null;
+}
+@Override
 boolean isComposite(Parser loaded) {
 return loaded instanceof CompositeParser;
 }
@@ -657,6 +668,17 @@ public class TikaConfig {
 return Detector.class;
 }
 @Override
+Detector preLoadOne(Class? extends Detector loadedClass, String 
classname, 
+MimeTypes mimeTypes) throws TikaException {
+// If they asked for the mime types as a detector, give
+//  them the one we've already created. TIKA-1708
+if (MimeTypes.class.equals(loadedClass)) {
+return mimeTypes;
+}
+// Continue with normal loading
+return null;
+}
+@Override
 boolean isComposite(Detector loaded) {
 return loaded instanceof CompositeDetector;
 }
@@ -728,6 +750,12 @@ public class TikaConfig {
 return Translator.class;
 }
 @Override
+Translator preLoadOne(Class? extends Translator loadedClass, String 
classname, 
+  MimeTypes mimeTypes) throws TikaException {
+// Continue with normal loading
+return null;
+}
+@Override
 boolean isComposite(Translator loaded) { return false; }
 @Override
 boolean isComposite(Class? extends Translator loadedClass) { return 
false; }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java?rev=1696160&r1=1696159&r2=1696160&view=diff

svn commit: r1696054 - in /tika/trunk/tika-parsers: ./ src/main/java/org/apache/tika/parser/journal/ src/main/resources/META-INF/services/ src/main/resources/org/apache/tika/parser/journal/ src/test/j

2015-08-15 Thread nick
Author: nick
Date: Sat Aug 15 14:57:54 2015
New Revision: 1696054

URL: http://svn.apache.org/r1696054
Log:
Back out r1695816, so the build can pass again, pending a fix of the broken 
grobid poms. Fix being tracked in TIKA-1699

Removed:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/

tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf
Modified:
tika/trunk/tika-parsers/pom.xml

tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: tika/trunk/tika-parsers/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1696054&r1=1696053&r2=1696054&view=diff
==
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Sat Aug 15 14:57:54 2015
@@ -232,14 +232,6 @@
   version0.7/version
 /dependency
 
-   !--  GROBID Dependencies --
-   dependency
- groupIdorg.grobid/groupId
- artifactIdgrobid-core/artifactId
- version0.3.4/version
-   /dependency   
-   
-
 !-- Provided dependencies --
 dependency
   groupIdorg.xerial/groupId

Modified: 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1696054&r1=1696053&r2=1696054&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 (original)
+++ 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 Sat Aug 15 14:57:54 2015
@@ -66,4 +66,3 @@ org.apache.tika.parser.isatab.ISArchiveP
 org.apache.tika.parser.geoinfo.GeographicInformationParser
 org.apache.tika.parser.geo.topic.GeoParser
 org.apache.tika.parser.external.CompositeExternalParser
-org.apache.tika.parser.journal.JournalParser
\ No newline at end of file




svn commit: r1694958 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java

2015-08-10 Thread nick
Author: nick
Date: Mon Aug 10 06:14:43 2015
New Revision: 1694958

URL: http://svn.apache.org/r1694958
Log:
Fix indents/whitespace

Modified:

tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java

Modified: 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1694958&r1=1694957&r2=1694958&view=diff
==
--- 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java 
(original)
+++ 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java 
Mon Aug 10 06:14:43 2015
@@ -37,43 +37,41 @@ import org.xml.sax.ContentHandler;
 
 @SuppressWarnings(deprecation)
 public class MyFirstTika {
-
-   public static void main(String[] args) throws Exception {
-   String filename = args[0];
-   MimeTypes mimeRegistry = TikaConfig.getDefaultConfig()
-   .getMimeRepository();
-
-   System.out.println(Examining: [ + filename + ]);
-
-   System.out.println(The MIME type (based on filename) is: [
-   + mimeRegistry.getMimeType(filename) + ]);
-
-   System.out.println(The MIME type (based on MAGIC) is: [
-   + mimeRegistry.getMimeType(new File(filename)) 
+ ]);
-
-   Detector mimeDetector = (Detector) mimeRegistry;
-   System.out
-   .println(The MIME type (based on the Detector 
interface) is: [
-   + mimeDetector.detect(new 
File(filename).toURI().toURL()
-   .openStream(), 
new Metadata()) + ]);
-
-   LanguageIdentifier lang = new LanguageIdentifier(new 
LanguageProfile(
-   FileUtils.readFileToString(new 
File(filename;
-
-   System.out.println(The language of this content is: [
-   + lang.getLanguage() + ]);
-
-   Parser parser = TikaConfig.getDefaultConfig().getParser(
-   
MediaType.parse(mimeRegistry.getMimeType(filename).getName()));
-   Metadata parsedMet = new Metadata();
-   ContentHandler handler = new BodyContentHandler();
-   parser.parse(new File(filename).toURI().toURL().openStream(), 
handler,
-   parsedMet, new ParseContext());
-
-   System.out.println(Parsed Metadata: );
-   System.out.println(parsedMet);
-   System.out.println(Parsed Text: );
-   System.out.println(handler.toString());
-
-   }
+public static void main(String[] args) throws Exception {
+String filename = args[0];
+MimeTypes mimeRegistry = TikaConfig.getDefaultConfig()
+.getMimeRepository();
+
+System.out.println(Examining: [ + filename + ]);
+
+System.out.println(The MIME type (based on filename) is: [
++ mimeRegistry.getMimeType(filename) + ]);
+
+System.out.println(The MIME type (based on MAGIC) is: [
++ mimeRegistry.getMimeType(new File(filename)) + ]);
+
+Detector mimeDetector = (Detector) mimeRegistry;
+System.out
+.println(The MIME type (based on the Detector interface) is: [
++ mimeDetector.detect(new File(filename).toURI().toURL()
+.openStream(), new Metadata()) + ]);
+
+LanguageIdentifier lang = new LanguageIdentifier(new LanguageProfile(
+FileUtils.readFileToString(new File(filename;
+
+System.out.println(The language of this content is: [
++ lang.getLanguage() + ]);
+
+Parser parser = TikaConfig.getDefaultConfig().getParser(
+MediaType.parse(mimeRegistry.getMimeType(filename).getName()));
+Metadata parsedMet = new Metadata();
+ContentHandler handler = new BodyContentHandler();
+parser.parse(new File(filename).toURI().toURL().openStream(), handler,
+parsedMet, new ParseContext());
+
+System.out.println(Parsed Metadata: );
+System.out.println(parsedMet);
+System.out.println(Parsed Text: );
+System.out.println(handler.toString());
+}
 }




svn commit: r1694961 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java

2015-08-10 Thread nick
Author: nick
Date: Mon Aug 10 06:24:57 2015
New Revision: 1694961

URL: http://svn.apache.org/r1694961
Log:
Several people on StackOverflow are getting confused by this example, show how 
to use AutoDetectParser first, all the components second

Modified:

tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java

Modified: 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1694961&r1=1694960&r2=1694961&view=diff
==
--- 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java 
(original)
+++ 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java 
Mon Aug 10 06:24:57 2015
@@ -19,11 +19,13 @@ import java.io.File;
 import org.apache.commons.io.FileUtils;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.language.LanguageIdentifier;
 import org.apache.tika.language.LanguageProfile;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
@@ -33,14 +35,45 @@ import org.xml.sax.ContentHandler;
  * Demonstrates how to call the different components within Tika: its
  * {@link Detector} framework (aka MIME identification and repository), its
  * {@link Parser} interface, its {@link LanguageIdentifier} and other goodies.
+ * It also shows the easy way via {@link AutoDetectParser}
  */
 
 @SuppressWarnings("deprecation")
 public class MyFirstTika {
 public static void main(String[] args) throws Exception {
 String filename = args[0];
-MimeTypes mimeRegistry = TikaConfig.getDefaultConfig()
-.getMimeRepository();
+TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+
+Metadata metadata = new Metadata();
+String text = parseUsingComponents(filename, tikaConfig, metadata);
+System.out.println(Parsed Metadata: );
+System.out.println(metadata);
+System.out.println(Parsed Text: );
+System.out.println(text);
+
+System.out.println(-);
+
+metadata = new Metadata();
+text = parseUsingAutoDetect(filename, tikaConfig, metadata);
+System.out.println(Parsed Metadata: );
+System.out.println(metadata);
+System.out.println(Parsed Text: );
+System.out.println(text);
+}
+
+public static String parseUsingAutoDetect(String filename, TikaConfig 
tikaConfig, 
+Metadata metadata) throws Exception {
+System.out.println(Handling using AutoDetectParser: [ + filename + 
]);
+
+AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+ContentHandler handler = new BodyContentHandler();
+TikaInputStream stream = TikaInputStream.get(new File(filename));
+parser.parse(stream, handler, metadata, new ParseContext());
+return handler.toString();
+}
+public static String parseUsingComponents(String filename, TikaConfig 
tikaConfig, 
+Metadata metadata) throws Exception {
+MimeTypes mimeRegistry = tikaConfig.getMimeRepository();
 
 System.out.println(Examining: [ + filename + ]);
 
@@ -51,8 +84,7 @@ public class MyFirstTika {
 + mimeRegistry.getMimeType(new File(filename)) + ]);
 
 Detector mimeDetector = (Detector) mimeRegistry;
-System.out
-.println(The MIME type (based on the Detector interface) is: [
+System.out.println(The MIME type (based on the Detector interface) 
is: [
 + mimeDetector.detect(new File(filename).toURI().toURL()
 .openStream(), new Metadata()) + ]);
 
@@ -62,16 +94,12 @@ public class MyFirstTika {
 System.out.println(The language of this content is: [
 + lang.getLanguage() + ]);
 
-Parser parser = TikaConfig.getDefaultConfig().getParser(
+Parser parser = tikaConfig.getParser(
 MediaType.parse(mimeRegistry.getMimeType(filename).getName()));
-Metadata parsedMet = new Metadata();
 ContentHandler handler = new BodyContentHandler();
 parser.parse(new File(filename).toURI().toURL().openStream(), handler,
-parsedMet, new ParseContext());
-
-System.out.println(Parsed Metadata: );
-System.out.println(parsedMet);
-System.out.println(Parsed Text: );
-System.out.println(handler.toString());
+metadata, new ParseContext());
+
+return handler.toString();
 }
 }




svn commit: r1694962 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java

2015-08-10 Thread nick
Author: nick
Date: Mon Aug 10 06:33:51 2015
New Revision: 1694962

URL: http://svn.apache.org/r1694962
Log:
Replace deprecated method use and outdated practice from the example

Modified:

tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java

Modified: 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1694962&r1=1694961&r2=1694962&view=diff
==
--- 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java 
(original)
+++ 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java 
Mon Aug 10 06:33:51 2015
@@ -15,6 +15,7 @@
 package org.apache.tika.example;
 
 import java.io.File;
+import java.io.InputStream;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.tika.config.TikaConfig;
@@ -35,10 +36,9 @@ import org.xml.sax.ContentHandler;
  * Demonstrates how to call the different components within Tika: its
  * {@link Detector} framework (aka MIME identification and repository), its
  * {@link Parser} interface, its {@link LanguageIdentifier} and other goodies.
+ * 
  * It also shows the easy way via {@link AutoDetectParser}
  */
-
-@SuppressWarnings("deprecation")
 public class MyFirstTika {
 public static void main(String[] args) throws Exception {
 String filename = args[0];
@@ -77,16 +77,18 @@ public class MyFirstTika {
 
 System.out.println(Examining: [ + filename + ]);
 
+metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
 System.out.println(The MIME type (based on filename) is: [
-+ mimeRegistry.getMimeType(filename) + ]);
++ mimeRegistry.detect(null, metadata) + ]);
 
+InputStream stream = TikaInputStream.get(new File(filename));
 System.out.println(The MIME type (based on MAGIC) is: [
-+ mimeRegistry.getMimeType(new File(filename)) + ]);
++ mimeRegistry.detect(stream, metadata) + ]);
 
-Detector mimeDetector = (Detector) mimeRegistry;
+stream = TikaInputStream.get(new File(filename));
+Detector detector = tikaConfig.getDetector();
 System.out.println(The MIME type (based on the Detector interface) 
is: [
-+ mimeDetector.detect(new File(filename).toURI().toURL()
-.openStream(), new Metadata()) + ]);
++ detector.detect(stream, metadata) + ]);
 
 LanguageIdentifier lang = new LanguageIdentifier(new LanguageProfile(
 FileUtils.readFileToString(new File(filename))));
@@ -94,11 +96,14 @@ public class MyFirstTika {
 System.out.println(The language of this content is: [
 + lang.getLanguage() + ]);
 
-Parser parser = tikaConfig.getParser(
-MediaType.parse(mimeRegistry.getMimeType(filename).getName()));
+// Get a non-detecting parser that handles all the types it can
+Parser parser = tikaConfig.getParser();
+// Tell it what we think the content is
+MediaType type = detector.detect(stream, metadata);
+metadata.set(Metadata.CONTENT_TYPE, type.toString());
+// Have the file parsed to get the content and metadata
 ContentHandler handler = new BodyContentHandler();
-parser.parse(new File(filename).toURI().toURL().openStream(), handler,
-metadata, new ParseContext());
+parser.parse(stream, handler, metadata, new ParseContext());
 
 return handler.toString();
 }




svn commit: r1694974 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java

2015-08-10 Thread nick
Author: nick
Date: Mon Aug 10 07:00:03 2015
New Revision: 1694974

URL: http://svn.apache.org/r1694974
Log:
One more improvement

Modified:

tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java

Modified: 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1694974&r1=1694973&r2=1694974&view=diff
==
--- 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java 
(original)
+++ 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java 
Mon Aug 10 07:00:03 2015
@@ -67,7 +67,7 @@ public class MyFirstTika {
 
 AutoDetectParser parser = new AutoDetectParser(tikaConfig);
 ContentHandler handler = new BodyContentHandler();
-TikaInputStream stream = TikaInputStream.get(new File(filename));
+TikaInputStream stream = TikaInputStream.get(new File(filename), 
metadata);
 parser.parse(stream, handler, metadata, new ParseContext());
 return handler.toString();
 }




svn commit: r1694584 - /tika/trunk/tika-parent/pom.xml

2015-08-06 Thread nick
Author: nick
Date: Thu Aug  6 23:02:23 2015
New Revision: 1694584

URL: http://svn.apache.org/r1694584
Log:
Move to the most recent org.apache parent pom

Modified:
tika/trunk/tika-parent/pom.xml

Modified: tika/trunk/tika-parent/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parent/pom.xml?rev=1694584&r1=1694583&r2=1694584&view=diff
==
--- tika/trunk/tika-parent/pom.xml (original)
+++ tika/trunk/tika-parent/pom.xml Thu Aug  6 23:02:23 2015
@@ -25,7 +25,7 @@
   <parent>
 <groupId>org.apache</groupId>
 <artifactId>apache</artifactId>
-<version>10</version>
+<version>17</version>
 <relativePath />
   </parent>
 




svn commit: r1694585 - /tika/trunk/tika-core/pom.xml

2015-08-06 Thread nick
Author: nick
Date: Thu Aug  6 23:07:26 2015
New Revision: 1694585

URL: http://svn.apache.org/r1694585
Log:
More Tika Core rat excludes

Modified:
tika/trunk/tika-core/pom.xml

Modified: tika/trunk/tika-core/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/pom.xml?rev=1694585&r1=1694584&r2=1694585&view=diff
==
--- tika/trunk/tika-core/pom.xml (original)
+++ tika/trunk/tika-core/pom.xml Thu Aug  6 23:07:26 2015
@@ -85,6 +85,8 @@
 <configuration>
   <excludes>
 <exclude>src/test/resources/org/apache/tika/**</exclude>
+
<exclude>src/main/resources/org/apache/tika/language/*.ngp</exclude>
+
<exclude>src/main/resources/org/apache/tika/detect/*.nnmodel</exclude>
   </excludes>
 </configuration>
   </plugin>




svn commit: r1694587 - in /tika/trunk: tika-app/ tika-java7/ tika-server/ tika-translate/src/test/java/org/apache/tika/language/translate/

2015-08-06 Thread nick
Author: nick
Date: Thu Aug  6 23:18:34 2015
New Revision: 1694587

URL: http://svn.apache.org/r1694587
Log:
License headers and Apache Rat excludes

Modified:
tika/trunk/tika-app/pom.xml
tika/trunk/tika-java7/pom.xml
tika/trunk/tika-server/pom.xml

tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/Lingo24TranslatorTest.java

tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/MosesTranslatorTest.java

Modified: tika/trunk/tika-app/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/pom.xml?rev=1694587&r1=1694586&r2=1694587&view=diff
==
--- tika/trunk/tika-app/pom.xml (original)
+++ tika/trunk/tika-app/pom.xml Thu Aug  6 23:18:34 2015
@@ -177,6 +177,15 @@
   /execution
 /executions
   /plugin
+  plugin
+groupIdorg.apache.rat/groupId
+artifactIdapache-rat-plugin/artifactId
+configuration
+  excludes
+excludesrc/test/resources/test-data/**/exclude
+  /excludes
+/configuration
+  /plugin
 /plugins
   /build
 

Modified: tika/trunk/tika-java7/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-java7/pom.xml?rev=1694587&r1=1694586&r2=1694587&view=diff
==
--- tika/trunk/tika-java7/pom.xml (original)
+++ tika/trunk/tika-java7/pom.xml Thu Aug  6 23:18:34 2015
@@ -60,6 +60,16 @@
   /instructions
 /configuration
   /plugin
+  plugin
+groupIdorg.apache.rat/groupId
+artifactIdapache-rat-plugin/artifactId
+configuration
+  excludes
+
excludesrc/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector/exclude
+excludesrc/test/resources/test-documents/*/exclude
+  /excludes
+/configuration
+  /plugin
 /plugins
   /build
 

Modified: tika/trunk/tika-server/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/pom.xml?rev=1694587&r1=1694586&r2=1694587&view=diff
==
--- tika/trunk/tika-server/pom.xml (original)
+++ tika/trunk/tika-server/pom.xml Thu Aug  6 23:18:34 2015
@@ -256,6 +256,17 @@
 /executions
   /plugin
   plugin
+groupIdorg.apache.rat/groupId
+artifactIdapache-rat-plugin/artifactId
+configuration
+  excludes
+excludesrc/main/resources/tikaserver-version.properties/exclude
+excludesrc/test/resources/*/exclude
+excludeREADME.md/exclude
+  /excludes
+/configuration
+  /plugin
+  plugin
 groupIdcom.qmino/groupId
 artifactIdmiredot-maven-plugin/artifactId
 version1.4/version

Modified: 
tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/Lingo24TranslatorTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/Lingo24TranslatorTest.java?rev=1694587&r1=1694586&r2=1694587&view=diff
==
--- 
tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/Lingo24TranslatorTest.java
 (original)
+++ 
tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/Lingo24TranslatorTest.java
 Thu Aug  6 23:18:34 2015
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.tika.language.translate;
 
 import static org.junit.Assert.assertEquals;

Modified: 
tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/MosesTranslatorTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/MosesTranslatorTest.java?rev=1694587&r1=1694586&r2=1694587&view=diff
==
--- 
tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/MosesTranslatorTest.java
 (original)
+++ 
tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/MosesTranslatorTest.java
 Thu Aug  6 23:18:34 2015
@@ -1,3 +1,19 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES

svn commit: r1693733 - /tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

2015-08-01 Thread nick
Author: nick
Date: Sat Aug  1 17:02:26 2015
New Revision: 1693733

URL: http://svn.apache.org/r1693733
Log:
TIKA-1702 Move the parser and detector creation logic to the config loader 
classes

Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693733&r1=1693732&r2=1693733&view=diff
==
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Sat Aug  1 17:02:26 2015
@@ -129,7 +129,7 @@ public class TikaConfig {
 DetectorXmlLoader detectorLoader = new DetectorXmlLoader();
 
 this.mimeTypes = typesFromDomElement(element);
-this.detector = detectorFromDomElement(element, mimeTypes, loader);
+this.detector = detectorLoader.loadOverall(element, mimeTypes, loader);
 this.parser = parserLoader.loadOverall(element, mimeTypes, loader);
 this.translator = translatorFromDomElement(element, loader);
 }
@@ -213,8 +213,7 @@ public class TikaConfig {
 
 this.mimeTypes = typesFromDomElement(element);
 this.parser = parserLoader.loadOverall(element, mimeTypes, 
loader);
-this.detector =
-detectorFromDomElement(element, mimeTypes, loader);
+this.detector = detectorLoader.loadOverall(element, mimeTypes, 
loader);
 this.translator = translatorFromDomElement(element, loader);
 } catch (SAXException e) {
 throw new TikaException(
@@ -358,137 +357,6 @@ public class TikaConfig {
 return getDefaultMimeTypes(null);
 }
 }
-
-//private static CompositeParser parserFromDomElement(
-//Element element, MimeTypes mimeTypes, ServiceLoader loader)
-//throws TikaException, IOException {
-//ListParser parsers = new ArrayListParser();
-//
-//// Find the parser children of the parsers tag, if any
-//for (Element pe : getTopLevelElementChildren(element, parsers, 
parser)) {
-//parsers.add(parserFromParserDomElement(pe, mimeTypes, loader));
-//}
-//
-//if (parsers.isEmpty()) {
-//// No parsers defined, create a DefaultParser
-//return getDefaultParser(mimeTypes, loader);
-//} else if (parsers.size() == 1  parsers.get(0) instanceof 
CompositeParser) {
-//// Single Composite defined, use that
-//return (CompositeParser)parsers.get(0);
-//} else {
-//// Wrap the defined parsers up in a Composite
-//MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
-//return new CompositeParser(registry, parsers);
-//}
-//}
-private static Parser parserFromParserDomElement(
-Element parserNode, MimeTypes mimeTypes, ServiceLoader loader)
-throws TikaException, IOException {
-String name = parserNode.getAttribute(class);
-Parser parser = null;
-
-try {
-Class? extends Parser parserClass =
-loader.getServiceClass(Parser.class, name);
-// https://issues.apache.org/jira/browse/TIKA-866
-if (AutoDetectParser.class.isAssignableFrom(parserClass)) {
-throw new TikaException(
-AutoDetectParser not supported in a parser
-+  configuration element:  + name);
-}
-
-// Is this a composite or decorated parser? If so, support 
recursion
-if (CompositeParser.class.isAssignableFrom(parserClass) ||
-ParserDecorator.class.isAssignableFrom(parserClass)) {
-
-// Get the child parsers for it
-ListParser childParsers = new ArrayListParser();
-NodeList childParserNodes = 
parserNode.getElementsByTagName(parser);
-if (childParserNodes.getLength()  0) {
-for (int i = 0; i  childParserNodes.getLength(); i++) {
-childParsers.add(parserFromParserDomElement(
-(Element)childParserNodes.item(i), mimeTypes, 
loader
-));
-}
-}
-
-// Get the list of parsers to exclude
-SetClass? extends Parser excludeParsers = new 
HashSetClass? extends Parser();
-NodeList excludeParserNodes = 
parserNode.getElementsByTagName(parser-exclude);
-if (excludeParserNodes.getLength()  0) {
-for (int i = 0; i  excludeParserNodes.getLength

svn commit: r1693747 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/ tika-core/src/main/java/org/apache/tika/language/translate/ tika-parsers/src/test/java/org/apache/tika/config/ ti

2015-08-01 Thread nick
Author: nick
Date: Sat Aug  1 17:53:53 2015
New Revision: 1693747

URL: http://svn.apache.org/r1693747
Log:
Convert Translator config to the new pattern for TIKA-1702, and add unit tests 
for Translator xml config

Added:

tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java

tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.xml

tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.xml

tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.xml
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693747&r1=1693746&r2=1693747&view=diff
==
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Sat Aug  1 17:53:53 2015
@@ -127,11 +127,12 @@ public class TikaConfig {
 throws TikaException, IOException {
 ParserXmlLoader parserLoader = new ParserXmlLoader();
 DetectorXmlLoader detectorLoader = new DetectorXmlLoader();
+TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader();
 
 this.mimeTypes = typesFromDomElement(element);
 this.detector = detectorLoader.loadOverall(element, mimeTypes, loader);
 this.parser = parserLoader.loadOverall(element, mimeTypes, loader);
-this.translator = translatorFromDomElement(element, loader);
+this.translator = translatorLoader.loadOverall(element, mimeTypes, 
loader);
 }
 
 /**
@@ -210,11 +211,12 @@ public class TikaConfig {
 Element element = 
getBuilder().parse(stream).getDocumentElement();
 ParserXmlLoader parserLoader = new ParserXmlLoader();
 DetectorXmlLoader detectorLoader = new DetectorXmlLoader();
+TranslatorXmlLoader translatorLoader = new 
TranslatorXmlLoader();
 
 this.mimeTypes = typesFromDomElement(element);
 this.parser = parserLoader.loadOverall(element, mimeTypes, 
loader);
 this.detector = detectorLoader.loadOverall(element, mimeTypes, 
loader);
-this.translator = translatorFromDomElement(element, loader);
+this.translator = translatorLoader.loadOverall(element, 
mimeTypes, loader);
 } catch (SAXException e) {
 throw new TikaException(
 Specified Tika configuration has syntax errors: 
@@ -322,15 +324,24 @@ public class TikaConfig {
 }
 private static ListElement getTopLevelElementChildren(Element element, 
 String parentName, String childrenName) throws TikaException {
-// Should be only zero or one parsers / detectors etc tag
-NodeList nodes = element.getElementsByTagName(parentName);
-if (nodes.getLength()  1) {
-throw new TikaException(Properties may not contain multiple 
+parentName+ entries);
+Node parentNode = null;
+if (parentName != null) {
+// Should be only zero or one parsers / detectors etc tag
+NodeList nodes = element.getElementsByTagName(parentName);
+if (nodes.getLength()  1) {
+throw new TikaException(Properties may not contain multiple 
+parentName+ entries);
+}
+else if (nodes.getLength() == 1) {
+parentNode = nodes.item(0);
+}
+} else {
+// All children directly on the master element
+parentNode = element;
 }
-else if (nodes.getLength() == 1) {
+
+if (parentNode != null) {
 // Find only the direct child parser/detector objects
-Node parsersE = nodes.item(0);
-nodes = parsersE.getChildNodes();
+NodeList nodes = parentNode.getChildNodes();
 ListElement elements = new ArrayListElement();
 for (int i = 0; i  nodes.getLength(); i++) {
 Node node = nodes.item(i);
@@ -383,39 +394,9 @@ public class TikaConfig {
 if (types != null) return types;
 return Collections.emptySet();
 }
-
-private static Translator translatorFromDomElement(
-Element element, ServiceLoader loader)
-throws TikaException

svn commit: r1693716 [3/5] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.2/ publish/1.3/ pu

2015-08-01 Thread nick
Modified: tika/site/publish/1.5/formats.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.5/formats.html?rev=1693716&r1=1693715&r2=1693716&view=diff
==
--- tika/site/publish/1.5/formats.html (original)
+++ tika/site/publish/1.5/formats.html Sat Aug  1 15:25:44 2015
@@ -204,6 +204,7 @@
   
   
   
+  
   
 li class=expanded
 a href=../1.9/index.htmlApache Tika 1.9/a
@@ -230,6 +231,10 @@
   /li
   
 li class=none
+a href=../1.9/configuring.htmlConfiguring Tika/a
+  /li
+  
+li class=none
 a href=../1.9/examples.htmlUsage Examples/a
   /li
   

Modified: tika/site/publish/1.5/gettingstarted.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.5/gettingstarted.html?rev=1693716&r1=1693715&r2=1693716&view=diff
==
--- tika/site/publish/1.5/gettingstarted.html (original)
+++ tika/site/publish/1.5/gettingstarted.html Sat Aug  1 15:25:44 2015
@@ -256,6 +256,7 @@ curl http://.../document.doc \
   
   
   
+  
   
 li class=expanded
 a href=../1.9/index.htmlApache Tika 1.9/a
@@ -282,6 +283,10 @@ curl http://.../document.doc \
   /li
   
 li class=none
+a href=../1.9/configuring.htmlConfiguring Tika/a
+  /li
+  
+li class=none
 a href=../1.9/examples.htmlUsage Examples/a
   /li
   

Modified: tika/site/publish/1.5/index.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.5/index.html?rev=1693716&r1=1693715&r2=1693716&view=diff
==
--- tika/site/publish/1.5/index.html (original)
+++ tika/site/publish/1.5/index.html Sat Aug  1 15:25:44 2015
@@ -203,6 +203,7 @@
   
   
   
+  
   
 li class=expanded
 a href=../1.9/index.htmlApache Tika 1.9/a
@@ -229,6 +230,10 @@
   /li
   
 li class=none
+a href=../1.9/configuring.htmlConfiguring Tika/a
+  /li
+  
+li class=none
 a href=../1.9/examples.htmlUsage Examples/a
   /li
   

Modified: tika/site/publish/1.5/parser.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.5/parser.html?rev=1693716&r1=1693715&r2=1693716&view=diff
==
--- tika/site/publish/1.5/parser.html (original)
+++ tika/site/publish/1.5/parser.html Sat Aug  1 15:25:44 2015
@@ -215,6 +215,7 @@ try {
   
   
   
+  
   
 li class=expanded
 a href=../1.9/index.htmlApache Tika 1.9/a
@@ -241,6 +242,10 @@ try {
   /li
   
 li class=none
+a href=../1.9/configuring.htmlConfiguring Tika/a
+  /li
+  
+li class=none
 a href=../1.9/examples.htmlUsage Examples/a
   /li
   

Modified: tika/site/publish/1.5/parser_guide.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.5/parser_guide.html?rev=1693716&r1=1693715&r2=1693716&view=diff
==
--- tika/site/publish/1.5/parser_guide.html (original)
+++ tika/site/publish/1.5/parser_guide.html Sat Aug  1 15:25:44 2015
@@ -216,6 +216,7 @@ public class HelloParser extends Abstrac
   
   
   
+  
   
 li class=expanded
 a href=../1.9/index.htmlApache Tika 1.9/a
@@ -242,6 +243,10 @@ public class HelloParser extends Abstrac
   /li
   
 li class=none
+a href=../1.9/configuring.htmlConfiguring Tika/a
+  /li
+  
+li class=none
 a href=../1.9/examples.htmlUsage Examples/a
   /li
   

Modified: tika/site/publish/1.6/detection.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.6/detection.html?rev=1693716&r1=1693715&r2=1693716&view=diff
==
--- tika/site/publish/1.6/detection.html (original)
+++ tika/site/publish/1.6/detection.html Sat Aug  1 15:25:44 2015
@@ -200,6 +200,7 @@ for (InputStream is : myListOfStreams) {
   
   
   
+  
   
 li 

svn commit: r1693716 [5/5] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.2/ publish/1.3/ pu

2015-08-01 Thread nick
Modified: tika/site/publish/1.9/examples.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.9/examples.html?rev=1693716&r1=1693715&r2=1693716&view=diff
==
--- tika/site/publish/1.9/examples.html (original)
+++ tika/site/publish/1.9/examples.html Sat Aug  1 15:25:44 2015
@@ -116,41 +116,41 @@
 pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a 
number of very quick and easy ways to have your content parsed by Tika, and 
return the resulting plain text/pstyle type=text/css
@import url('attached-includes/css/shCoreDefault.css');
 /style
-div id=highlighter_294673 class=syntaxhighlighter nogutter  javatable 
border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv 
class=containerdiv class=line number49 index0 alt2code class=java 
keywordpublic/code code class=java plainString parseToStringExample() 
/codecode class=java keywordthrows/code code class=java 
plainIOException, SAXException, TikaException {/code/divdiv class=line 
number50 index1 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream 
stream = ParsingExample./codecode class=java keywordclass/codecode 
class=java plain.getResourceAsStream(/codecode class=java 
stringtest.doc/codecode class=java plain);/code/divdiv 
class=line number51 index2 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = 
/codecode class=java keywordnew/code code class=java 
plainTika();/code/div
 div class=line number52 index3 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code 
code class=java plain{/code/divdiv class=line number53 index4 
alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java keywordreturn/code code class=java 
plaintika.parseToString(stream);/code/divdiv class=line number54 index5 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain} /codecode class=java keywordfinally/code code 
class=java plain{/code/divdiv class=line number55 index6 alt2code 
class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainstream.close();/code/divdiv class=line number56 index7 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain}/code/divdiv class=line number57 index8 alt2code 
class=java plain}/code/div/
 div/td/tr/tbody/table/div/div
+div id=highlighter_34225 class=syntaxhighlighter nogutter  javatable 
border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv 
class=containerdiv class=line number53 index0 alt2code class=java 
keywordpublic/code code class=java plainString parseToStringExample() 
/codecode class=java keywordthrows/code code class=java 
plainIOException, SAXException, TikaException {/code/divdiv class=line 
number54 index1 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream 
stream = ParsingExample./codecode class=java keywordclass/codecode 
class=java plain.getResourceAsStream(/codecode class=java 
stringtest.doc/codecode class=java plain);/code/divdiv 
class=line number55 index2 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = 
/codecode class=java keywordnew/code code class=java 
plainTika();/code/divd
 iv class=line number56 index3 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code 
code class=java plain{/code/divdiv class=line number57 index4 
alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java keywordreturn/code code class=java 
plaintika.parseToString(stream);/code/divdiv class=line number58 index5 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain} /codecode class=java keywordfinally/code code 
class=java plain{/code/divdiv class=line number59 index6 alt2code 
class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainstream.close();/code/divdiv class=line number60 index7 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain}/code/divdiv class=line number61 index8 alt2code 
class=java plain}/code/div/d
 iv/td/tr/tbody/table/div/div
 div class=section
 h4a name=Parsing_using_the_Auto-Detect_ParserParsing using the 
Auto-Detect Parser/a/h4
-pFor more control, you can call the a 
href=./api/org/apache/tika/parser/Parser.htmlTika Parsers/a directly. Most 
likely, you'll want to start out using the a 
href=./api/org/apache/tika/parser/AutoDetectParser.htmlAuto-Detect 
Parser/a, which automatically figures out what kind of content you have, then 
calls the appropriate parser for you./pdiv id=highlighter_420078 
class=syntaxhighlighter nogutter  javatable border=0 cellpadding=0 
cellspacing=0tbodytrtd class=codediv class=containerdiv 
class=line number83 index0 alt2code class=java keywordpublic/code 
code class=java plainString parseExample() /codecode class=java 
keywordthrows/code code class=java plainIOException, SAXException, 
TikaException 

svn commit: r1693716 [4/5] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.2/ publish/1.3/ pu

2015-08-01 Thread nick
Modified: tika/site/publish/1.8/examples.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.8/examples.html?rev=1693716&r1=1693715&r2=1693716&view=diff
==
--- tika/site/publish/1.8/examples.html (original)
+++ tika/site/publish/1.8/examples.html Sat Aug  1 15:25:44 2015
@@ -115,41 +115,41 @@
 pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a 
number of very quick and easy ways to have your content parsed by Tika, and 
return the resulting plain text/pstyle type=text/css
@import url('attached-includes/css/shCoreDefault.css');
 /style
-div id=highlighter_489927 class=syntaxhighlighter nogutter  javatable 
border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv 
class=containerdiv class=line number49 index0 alt2code class=java 
keywordpublic/code code class=java plainString parseToStringExample() 
/codecode class=java keywordthrows/code code class=java 
plainIOException, SAXException, TikaException {/code/divdiv class=line 
number50 index1 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream 
stream = ParsingExample./codecode class=java keywordclass/codecode 
class=java plain.getResourceAsStream(/codecode class=java 
stringtest.doc/codecode class=java plain);/code/divdiv 
class=line number51 index2 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = 
/codecode class=java keywordnew/code code class=java 
plainTika();/code/div
 div class=line number52 index3 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code 
code class=java plain{/code/divdiv class=line number53 index4 
alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java keywordreturn/code code class=java 
plaintika.parseToString(stream);/code/divdiv class=line number54 index5 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain} /codecode class=java keywordfinally/code code 
class=java plain{/code/divdiv class=line number55 index6 alt2code 
class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainstream.close();/code/divdiv class=line number56 index7 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain}/code/divdiv class=line number57 index8 alt2code 
class=java plain}/code/div/
 div/td/tr/tbody/table/div/div
+div id=highlighter_823561 class=syntaxhighlighter nogutter  javatable 
border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv 
class=containerdiv class=line number53 index0 alt2code class=java 
keywordpublic/code code class=java plainString parseToStringExample() 
/codecode class=java keywordthrows/code code class=java 
plainIOException, SAXException, TikaException {/code/divdiv class=line 
number54 index1 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream 
stream = ParsingExample./codecode class=java keywordclass/codecode 
class=java plain.getResourceAsStream(/codecode class=java 
stringtest.doc/codecode class=java plain);/code/divdiv 
class=line number55 index2 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = 
/codecode class=java keywordnew/code code class=java 
plainTika();/code/div
 div class=line number56 index3 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code 
code class=java plain{/code/divdiv class=line number57 index4 
alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java keywordreturn/code code class=java 
plaintika.parseToString(stream);/code/divdiv class=line number58 index5 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain} /codecode class=java keywordfinally/code code 
class=java plain{/code/divdiv class=line number59 index6 alt2code 
class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainstream.close();/code/divdiv class=line number60 index7 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain}/code/divdiv class=line number61 index8 alt2code 
class=java plain}/code/div/
 div/td/tr/tbody/table/div/div
 div class=section
 h4a name=Parsing_using_the_Auto-Detect_ParserParsing using the 
Auto-Detect Parser/a/h4
-pFor more control, you can call the a 
href=./api/org/apache/tika/parser/Parser.htmlTika Parsers/a directly. Most 
likely, you'll want to start out using the a 
href=./api/org/apache/tika/parser/AutoDetectParser.htmlAuto-Detect 
Parser/a, which automatically figures out what kind of content you have, then 
calls the appropriate parser for you./pdiv id=highlighter_934037 
class=syntaxhighlighter nogutter  javatable border=0 cellpadding=0 
cellspacing=0tbodytrtd class=codediv class=containerdiv 
class=line number83 index0 alt2code class=java keywordpublic/code 
code class=java plainString parseExample() /codecode class=java 
keywordthrows/code code class=java plainIOException, SAXException, 
TikaException 

svn commit: r1693716 [1/5] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.2/ publish/1.3/ pu

2015-08-01 Thread nick
Author: nick
Date: Sat Aug  1 15:25:44 2015
New Revision: 1693716

URL: http://svn.apache.org/r1693716
Log:
Republish with the 1.9 Configuration section in the menu

Modified:
tika/site/publish/0.10/detection.html
tika/site/publish/0.10/formats.html
tika/site/publish/0.10/gettingstarted.html
tika/site/publish/0.10/index.html
tika/site/publish/0.10/parser.html
tika/site/publish/0.10/parser_guide.html
tika/site/publish/0.5/documentation.html
tika/site/publish/0.5/formats.html
tika/site/publish/0.5/gettingstarted.html
tika/site/publish/0.5/index.html
tika/site/publish/0.6/formats.html
tika/site/publish/0.6/gettingstarted.html
tika/site/publish/0.6/index.html
tika/site/publish/0.6/parser.html
tika/site/publish/0.7/detection.html
tika/site/publish/0.7/formats.html
tika/site/publish/0.7/gettingstarted.html
tika/site/publish/0.7/index.html
tika/site/publish/0.7/parser.html
tika/site/publish/0.7/parser_guide.html
tika/site/publish/0.8/detection.html
tika/site/publish/0.8/formats.html
tika/site/publish/0.8/gettingstarted.html
tika/site/publish/0.8/index.html
tika/site/publish/0.8/parser.html
tika/site/publish/0.8/parser_guide.html
tika/site/publish/0.9/detection.html
tika/site/publish/0.9/formats.html
tika/site/publish/0.9/gettingstarted.html
tika/site/publish/0.9/index.html
tika/site/publish/0.9/parser.html
tika/site/publish/0.9/parser_guide.html
tika/site/publish/1.0/detection.html
tika/site/publish/1.0/formats.html
tika/site/publish/1.0/gettingstarted.html
tika/site/publish/1.0/index.html
tika/site/publish/1.0/parser.html
tika/site/publish/1.0/parser_guide.html
tika/site/publish/1.1/detection.html
tika/site/publish/1.1/formats.html
tika/site/publish/1.1/gettingstarted.html
tika/site/publish/1.1/index.html
tika/site/publish/1.1/parser.html
tika/site/publish/1.1/parser_guide.html
tika/site/publish/1.10/examples.html
tika/site/publish/1.10/formats.html
tika/site/publish/1.2/detection.html
tika/site/publish/1.2/formats.html
tika/site/publish/1.2/gettingstarted.html
tika/site/publish/1.2/index.html
tika/site/publish/1.2/parser.html
tika/site/publish/1.2/parser_guide.html
tika/site/publish/1.3/detection.html
tika/site/publish/1.3/formats.html
tika/site/publish/1.3/gettingstarted.html
tika/site/publish/1.3/index.html
tika/site/publish/1.3/parser.html
tika/site/publish/1.3/parser_guide.html
tika/site/publish/1.4/detection.html
tika/site/publish/1.4/formats.html
tika/site/publish/1.4/gettingstarted.html
tika/site/publish/1.4/index.html
tika/site/publish/1.4/parser.html
tika/site/publish/1.4/parser_guide.html
tika/site/publish/1.5/detection.html
tika/site/publish/1.5/formats.html
tika/site/publish/1.5/gettingstarted.html
tika/site/publish/1.5/index.html
tika/site/publish/1.5/parser.html
tika/site/publish/1.5/parser_guide.html
tika/site/publish/1.6/detection.html
tika/site/publish/1.6/formats.html
tika/site/publish/1.6/gettingstarted.html
tika/site/publish/1.6/index.html
tika/site/publish/1.6/parser.html
tika/site/publish/1.6/parser_guide.html
tika/site/publish/1.7/detection.html
tika/site/publish/1.7/examples.html
tika/site/publish/1.7/formats.html
tika/site/publish/1.7/gettingstarted.html
tika/site/publish/1.7/index.html
tika/site/publish/1.7/parser.html
tika/site/publish/1.7/parser_guide.html
tika/site/publish/1.8/detection.html
tika/site/publish/1.8/examples.html
tika/site/publish/1.8/formats.html
tika/site/publish/1.8/gettingstarted.html
tika/site/publish/1.8/index.html
tika/site/publish/1.8/parser.html
tika/site/publish/1.8/parser_guide.html
tika/site/publish/1.9/detection.html
tika/site/publish/1.9/examples.html
tika/site/publish/1.9/formats.html
tika/site/publish/1.9/gettingstarted.html
tika/site/publish/1.9/index.html
tika/site/publish/1.9/parser.html
tika/site/publish/1.9/parser_guide.html
tika/site/publish/contribute.html
tika/site/publish/distribution-management.html
tika/site/publish/download.html
tika/site/publish/index.html
tika/site/publish/integration.html
tika/site/publish/issue-tracking.html
tika/site/publish/license.html
tika/site/publish/mail-lists.html
tika/site/publish/plugin-management.html
tika/site/publish/plugins.html
tika/site/publish/project-info.html
tika/site/publish/project-summary.html
tika/site/publish/source-repository.html
tika/site/publish/team-list.html
tika/site/src/site/site.xml

Modified: tika/site/publish/0.10/detection.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/0.10/detection.html?rev=1693716r1=1693715r2=1693716view=diff
==
--- tika/site/publish/0.10

svn commit: r1693746 - /tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java

2015-08-01 Thread nick
Author: nick
Date: Sat Aug  1 17:39:40 2015
New Revision: 1693746

URL: http://svn.apache.org/r1693746
Log:
Empty Translator, similar to the ones for Parser and Detector, for use in 
testing etc

Added:

tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java

Added: 
tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java?rev=1693746view=auto
==
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java
 (added)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java
 Sat Aug  1 17:39:40 2015
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language.translate;
+
+/**
+ * Dummy translator that always declines to give any text. Useful as a
+ * sentinel translator for when no other translators are available,
+ * and for use in testing.
+ */
+public class EmptyTranslator implements Translator {
+public String translate(String text, String sourceLanguage, String 
targetLanguage) {
+return null;
+}
+
+public String translate(String text, String targetLanguage) {
+return null;
+}
+
+public boolean isAvailable() {
+return true;
+}
+}




svn commit: r1693715 - in /tika/site: publish/1.9/configuring.html src/site/apt/1.9/configuring.apt

2015-08-01 Thread nick
Author: nick
Date: Sat Aug  1 15:24:45 2015
New Revision: 1693715

URL: http://svn.apache.org/r1693715
Log:
TIKA-1702 more documentation on configuration

Added:
tika/site/publish/1.9/configuring.html
Modified:
tika/site/src/site/apt/1.9/configuring.apt

Added: tika/site/publish/1.9/configuring.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.9/configuring.html?rev=1693715view=auto
==
--- tika/site/publish/1.9/configuring.html (added)
+++ tika/site/publish/1.9/configuring.html Sat Aug  1 15:24:45 2015
@@ -0,0 +1,394 @@
+!DOCTYPE html PUBLIC -//W3C//DTD XHTML 1.0 Transitional//EN
+  http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd;
+
+!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  License); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+--
+
+
+
+
+
+
+
+html xmlns=http://www.w3.org/1999/xhtml;
+  head
+meta http-equiv=Content-Type content=text/html; charset=UTF-8 /
+titleApache Tika - Configuring Tika/title
+style type=text/css media=all
+  @import url(../css/site.css);
+/style
+link rel=icon type=image/png href=../tikaNoText16.png /
+script type=text/javascript
+  function selectProvider(form) {
+provider = form.elements['searchProvider'].value;
+if (provider == any) {
+  if (Math.random()  0.5) {
+provider = lucid;
+  } else {
+provider = sl;
+  }
+}
+if (provider == lucid) {
+  form.action = http://find.searchhub.org/p:tika;;
+} else if (provider == sl) {
+  form.action = http://search-lucene.com/tika;;
+}
+days = 90;
+date = new Date();
+date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
+expires = ; expires= + date.toGMTString();
+document.cookie = searchProvider= + provider + expires + ; path=/;
+  }
+  function initProvider() {
+if (document.cookie.length0) {
+  cStart=document.cookie.indexOf(searchProvider=);
+  if (cStart!=-1) {
+cStart=cStart + searchProvider=.length;
+cEnd=document.cookie.indexOf(;, cStart);
+if (cEnd==-1) {
+  cEnd=document.cookie.length;
+}
+provider = unescape(document.cookie.substring(cStart,cEnd));
+document.forms['searchform'].elements['searchProvider'].value = 
provider;
+  }
+}
+document.forms['searchform'].elements['q'].focus();
+  }
+/script
+  /head
+  body onLoad=initProvider();
+div id=body
+  div id=banner
+a href=http://tika.apache.org; id=bannerLeft title=Apache Tika
+  img src=http://tika.apache.org/tika.png; alt=Apache Tika
+width=292 height=100//a
+a href=http://www.apache.org/; id=bannerRight
+   title=The Apache Software Foundation
+  img src=http://tika.apache.org/asf-logo.gif; alt=The Apache 
Software Foundation
+width=387 height=100//a
+  /div
+  div id=content
+!-- Licensed to the Apache Software Foundation (ASF) under one or 
more --!-- contributor license agreements.  See the NOTICE file distributed 
with --!-- this work for additional information regarding copyright 
ownership. --!-- The ASF licenses this file to You under the Apache License, 
Version 2.0 --!-- (the License); you may not use this file except in 
compliance with --!-- the License.  You may obtain a copy of the License at 
--!--  --!-- http://www.apache.org/licenses/LICENSE-2.0 --!--  --!-- 
Unless required by applicable law or agreed to in writing, software --!-- 
distributed under the License is distributed on an AS IS BASIS, --!-- 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
--!-- See the License for the specific language governing permissions and 
--!-- limitations under the License. --div class=section
+h2Configuring Tikaa name=Configuring_Tika/a/h2
+pOut of the box, Apache Tika will attempt to start with all available 
Detectors and Parsers, running with sensible defaults. For most users, this 
default configuration will work well./p
+pThis page gives you information on how to configure the various components 
of Apache Tika

svn commit: r1693745 - /tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java

2015-08-01 Thread nick
Author: nick
Date: Sat Aug  1 17:27:16 2015
New Revision: 1693745

URL: http://svn.apache.org/r1693745
Log:
If DefaultTranslator has multiple translators loaded, use the first available, 
not just blindly the first

Modified:

tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java?rev=1693745r1=1693744r2=1693745view=diff
==
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
 (original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
 Sat Aug  1 17:27:16 2015
@@ -17,15 +17,22 @@
 
 package org.apache.tika.language.translate;
 
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.exception.TikaException;
-
 import java.io.IOException;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 
-public class DefaultTranslator implements Translator{
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.exception.TikaException;
+
+/**
+ * A translator which picks the first available {@link Translator}
+ * implementation found through the
+ * {@link javax.imageio.spi.ServiceRegistry service provider mechanism}.
+ *
+ * @since Apache Tika 1.6
+ */
+public class DefaultTranslator implements Translator {
 private transient final ServiceLoader loader;
 
 public DefaultTranslator(ServiceLoader loader) {
@@ -58,17 +65,39 @@ public class DefaultTranslator implement
 });
 return translators;
 }
+/**
+ * Returns the first available translator, or null if none are available
+ */
+private static Translator getFirstAvailable(ServiceLoader loader) {
+for (Translator t : getDefaultTranslators(loader)) {
+if (t.isAvailable()) return t;
+}
+return null;
+}
 
+/**
+ * Translate, using the first available service-loaded translator
+ */
 public String translate(String text, String sourceLanguage, String 
targetLanguage) throws TikaException, IOException {
-return getDefaultTranslators(loader).get(0).translate(text, 
sourceLanguage, targetLanguage);
+Translator t = getFirstAvailable(loader);
+if (t != null) {
+return t.translate(text, sourceLanguage, targetLanguage);
+}
+throw new TikaException(No translators currently available);
 }
 
+/**
+ * Translate, using the first available service-loaded translator
+ */
 public String translate(String text, String targetLanguage) throws 
TikaException, IOException {
-return getDefaultTranslators(loader).get(0).translate(text, 
targetLanguage);
+Translator t = getFirstAvailable(loader);
+if (t != null) {
+return t.translate(text, targetLanguage);
+}
+throw new TikaException(No translators currently available);
 }
 
 public boolean isAvailable() {
-return getDefaultTranslators(loader).get(0).isAvailable();
+return getFirstAvailable(loader) != null;
 }
-
 }




svn commit: r1693713 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/config/TikaConfig.java test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java

2015-08-01 Thread nick
Author: nick
Date: Sat Aug  1 14:53:36 2015
New Revision: 1693713

URL: http://svn.apache.org/r1693713
Log:
TIKA-1702 Refactor some of the config parser loading to be more re-usable for 
detectors, and bring the method signature in line WRT Composite vs not (must 
always be composite)

Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693713r1=1693712r2=1693713view=diff
==
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Sat Aug  1 14:53:36 2015
@@ -66,7 +66,7 @@ public class TikaConfig {
 return MimeTypes.getDefaultMimeTypes(loader);
 }
 
-protected Detector getDefaultDetector(
+protected CompositeDetector getDefaultDetector(
 MimeTypes types, ServiceLoader loader) {
 return new DefaultDetector(types, loader);
 }
@@ -80,7 +80,7 @@ public class TikaConfig {
 return new DefaultTranslator(loader);
 }
 private final CompositeParser parser;
-private final Detector detector;
+private final CompositeDetector detector;
 private final Translator translator;
 
 private final MimeTypes mimeTypes;
@@ -317,6 +317,33 @@ public class TikaConfig {
 }
 return null;
 }
+private static ListElement getTopLevelElementChildren(Element element, 
+String parentName, String childrenName) throws TikaException {
+// Should be only zero or one parsers / detectors etc tag
+NodeList nodes = element.getElementsByTagName(parentName);
+if (nodes.getLength()  1) {
+throw new TikaException(Properties may not contain multiple 
+parentName+ entries);
+}
+else if (nodes.getLength() == 1) {
+// Find only the direct child parser/detector objects
+Node parsersE = nodes.item(0);
+nodes = parsersE.getChildNodes();
+ListElement elements = new ArrayListElement();
+for (int i = 0; i  nodes.getLength(); i++) {
+Node node = nodes.item(i);
+if (node instanceof Element) {
+Element nodeE = (Element)node;
+if (childrenName.equals(nodeE.getTagName())) {
+elements.add(nodeE);
+}
+}
+}
+return elements;
+} else {
+// No elements of this type
+return Collections.emptyList();
+}
+}
 
 private static MimeTypes typesFromDomElement(Element element)
 throws TikaException, IOException {
@@ -333,24 +360,9 @@ public class TikaConfig {
 throws TikaException, IOException {
 ListParser parsers = new ArrayListParser();
 
-// Should be only zero or one parsers tag
-NodeList nodes = element.getElementsByTagName(parsers);
-if (nodes.getLength()  1) {
-throw new TikaException(Properties may not contain multiple 
Parsers entries);
-}
-else if (nodes.getLength() == 1) {
-// Find only the direct child parser objects
-Node parsersE = nodes.item(0);
-nodes = parsersE.getChildNodes();
-for (int i = 0; i  nodes.getLength(); i++) {
-Node node = nodes.item(i);
-if (node instanceof Element) {
-Element nodeE = (Element)node;
-if (parser.equals(nodeE.getTagName())) {
-parsers.add(parserFromParserDomElement(nodeE, 
mimeTypes, loader));
-}
-}
-}
+// Find the parser children of the parsers tag, if any
+for (Element pe : getTopLevelElementChildren(element, parsers, 
parser)) {
+parsers.add(parserFromParserDomElement(pe, mimeTypes, loader));
 }
 
 if (parsers.isEmpty()) {
@@ -500,7 +512,7 @@ public class TikaConfig {
 return Collections.emptySet();
 }
 
-private static Detector detectorFromDomElement(
+private static CompositeDetector detectorFromDomElement(
   Element element, MimeTypes mimeTypes, ServiceLoader loader)
   throws TikaException, IOException {
ListDetector detectors = new ArrayListDetector();

Modified: 
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java?rev=1693713r1

svn commit: r1693721 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java

2015-08-01 Thread nick
Author: nick
Date: Sat Aug  1 16:22:28 2015
New Revision: 1693721

URL: http://svn.apache.org/r1693721
Log:
TIKA-1702 Start moving to a loader class pattern for common Detector and Parser 
(+later others)

Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693721r1=1693720r2=1693721view=diff
==
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Sat Aug  1 16:22:28 2015
@@ -125,9 +125,12 @@ public class TikaConfig {
 
 private TikaConfig(Element element, ServiceLoader loader)
 throws TikaException, IOException {
+ParserXmlLoader parserLoader = new ParserXmlLoader();
+DetectorXmlLoader detectorLoader = new DetectorXmlLoader();
+
 this.mimeTypes = typesFromDomElement(element);
 this.detector = detectorFromDomElement(element, mimeTypes, loader);
-this.parser = parserFromDomElement(element, mimeTypes, loader);
+this.parser = parserLoader.loadOverall(element, mimeTypes, loader);
 this.translator = translatorFromDomElement(element, loader);
 }
 
@@ -204,11 +207,12 @@ public class TikaConfig {
 }
 
 try {
-Element element =
-getBuilder().parse(stream).getDocumentElement();
+Element element = 
getBuilder().parse(stream).getDocumentElement();
+ParserXmlLoader parserLoader = new ParserXmlLoader();
+DetectorXmlLoader detectorLoader = new DetectorXmlLoader();
+
 this.mimeTypes = typesFromDomElement(element);
-this.parser =
-parserFromDomElement(element, mimeTypes, loader);
+this.parser = parserLoader.loadOverall(element, mimeTypes, 
loader);
 this.detector =
 detectorFromDomElement(element, mimeTypes, loader);
 this.translator = translatorFromDomElement(element, loader);
@@ -355,28 +359,28 @@ public class TikaConfig {
 }
 }
 
-private static CompositeParser parserFromDomElement(
-Element element, MimeTypes mimeTypes, ServiceLoader loader)
-throws TikaException, IOException {
-ListParser parsers = new ArrayListParser();
-
-// Find the parser children of the parsers tag, if any
-for (Element pe : getTopLevelElementChildren(element, parsers, 
parser)) {
-parsers.add(parserFromParserDomElement(pe, mimeTypes, loader));
-}
-
-if (parsers.isEmpty()) {
-// No parsers defined, create a DefaultParser
-return getDefaultParser(mimeTypes, loader);
-} else if (parsers.size() == 1  parsers.get(0) instanceof 
CompositeParser) {
-// Single Composite defined, use that
-return (CompositeParser)parsers.get(0);
-} else {
-// Wrap the defined parsers up in a Composite
-MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
-return new CompositeParser(registry, parsers);
-}
-}
+//private static CompositeParser parserFromDomElement(
+//Element element, MimeTypes mimeTypes, ServiceLoader loader)
+//throws TikaException, IOException {
+//ListParser parsers = new ArrayListParser();
+//
+//// Find the parser children of the parsers tag, if any
+//for (Element pe : getTopLevelElementChildren(element, parsers, 
parser)) {
+//parsers.add(parserFromParserDomElement(pe, mimeTypes, loader));
+//}
+//
+//if (parsers.isEmpty()) {
+//// No parsers defined, create a DefaultParser
+//return getDefaultParser(mimeTypes, loader);
+//} else if (parsers.size() == 1  parsers.get(0) instanceof 
CompositeParser) {
+//// Single Composite defined, use that
+//return (CompositeParser)parsers.get(0);
+//} else {
+//// Wrap the defined parsers up in a Composite
+//MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
+//return new CompositeParser(registry, parsers);
+//}
+//}
 private static Parser parserFromParserDomElement(
 Element parserNode, MimeTypes mimeTypes, ServiceLoader loader)
 throws TikaException, IOException {
@@ -585,4 +589,79 @@ public class TikaConfig {
 return translators.get(0);
 }
 }
+
+private static

svn commit: r1693762 - /tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java

2015-08-01 Thread nick
Author: nick
Date: Sat Aug  1 20:58:07 2015
New Revision: 1693762

URL: http://svn.apache.org/r1693762
Log:
Fix up the Probabilistic Mime Detection Test

Modified:

tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java

Modified: 
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java?rev=1693762r1=1693761r2=1693762view=diff
==
--- 
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
 (original)
+++ 
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
 Sat Aug  1 20:58:07 2015
@@ -18,6 +18,7 @@
 package org.apache.tika.mime;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
 import java.io.ByteArrayInputStream;
@@ -28,10 +29,7 @@ import java.nio.charset.Charset;
 
 import org.apache.tika.Tika;
 import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.CompositeDetector;
 import org.apache.tika.detect.DefaultProbDetector;
-import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.ProbabilisticMimeDetectionSelector.Builder;
 import org.junit.Before;
@@ -39,41 +37,31 @@ import org.junit.Test;
 
 public class ProbabilisticMimeDetectionTestWithTika {
 private static final Charset UTF8 = Charset.forName(UTF-8);
-// private ProbabilisticMimeDetectionSelector proDetector;
-private Tika tika;
+
+private ProbabilisticMimeDetectionSelector proSelector;
 private MediaTypeRegistry registry;
+private Tika tika;
 
 /** @inheritDoc */
 @Before
 public void setUp() {
-try {
-registry = MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry();
-tika = new Tika(new TikaConfig() {
-@Override
-protected CompositeDetector getDefaultDetector(MimeTypes types,
-ServiceLoader loader) {
-/*
- * here is an example with the use of the builder to
- * instantiate the object.
- */
-Builder builder = new 
ProbabilisticMimeDetectionSelector.Builder();
-ProbabilisticMimeDetectionSelector proDetector = new 
ProbabilisticMimeDetectionSelector(
-types, builder.priorMagicFileType(0.5f)
-.priorExtensionFileType(0.5f)
-.priorMetaFileType(0.5f));
-return new DefaultProbDetector(proDetector, loader);
-}
-});
-} catch (TikaException e) {
-// TODO Auto-generated catch block
-e.printStackTrace();
-} catch (IOException e) {
-// TODO Auto-generated catch block
-e.printStackTrace();
-} finally {
-
-}
-
+MimeTypes types = MimeTypes.getDefaultMimeTypes();
+ServiceLoader loader = new ServiceLoader();
+registry = types.getMediaTypeRegistry();
+
+/*
+ * here is an example with the use of the builder to
+ * instantiate the object.
+ */
+Builder builder = new ProbabilisticMimeDetectionSelector.Builder();
+proSelector = new ProbabilisticMimeDetectionSelector(
+types, builder.priorMagicFileType(0.5f)
+.priorExtensionFileType(0.5f)
+.priorMetaFileType(0.5f));
+DefaultProbDetector detector = new DefaultProbDetector(proSelector, 
loader);
+
+// Use a default Tika, except for our different detector
+tika = new Tika(detector);
 }
 
 @Test
@@ -198,11 +186,6 @@ public class ProbabilisticMimeDetectionT
 }
 }
 
-private void assertNotNull(String string, InputStream in) {
-// TODO Auto-generated method stub
-
-}
-
 /**
  * Test for type detection of empty documents.
  * 




svn commit: r1693759 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/config/ test/java/org/apache/tika/config/ test/resources/org/apache/tika/config/

2015-08-01 Thread nick
Author: nick
Date: Sat Aug  1 20:40:39 2015
New Revision: 1693759

URL: http://svn.apache.org/r1693759
Log:
TIKA-1700 Add TikaConfig constructors that take a ServiceLoader, and add a unit 
test that shows we (now) use the LoadErrorHandler on that properly for 
reporting problems with listed class names

Added:

tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1700-unknown-parser.xml
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java

tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java?rev=1693759r1=1693758r2=1693759view=diff
==
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java 
Sat Aug  1 20:40:39 2015
@@ -173,6 +173,10 @@ public class ServiceLoader {
 /**
  * Loads and returns the named service class that's expected to implement
  * the given interface.
+ * 
+ * Note that this class does not use the {@link LoadErrorHandler}, a
+ *  {@link ClassNotFoundException} is always returned for unknown
+ *  classes or classes of the wrong type
  *
  * @param iface service interface
  * @param name service class name

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693759&r1=1693758&r2=1693759&view=diff
==
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Sat Aug  1 20:40:39 2015
@@ -92,18 +92,25 @@ public class TikaConfig {
 
 public TikaConfig(File file)
 throws TikaException, IOException, SAXException {
-this(getBuilder().parse(file));
+this(file, new ServiceLoader());
+}
+public TikaConfig(File file, ServiceLoader loader)
+throws TikaException, IOException, SAXException {
+this(getBuilder().parse(file), loader);
 }
 
 public TikaConfig(URL url)
 throws TikaException, IOException, SAXException {
 this(url, ServiceLoader.getContextClassLoader());
 }
-
 public TikaConfig(URL url, ClassLoader loader)
 throws TikaException, IOException, SAXException {
 this(getBuilder().parse(url.toString()).getDocumentElement(), loader);
 }
+public TikaConfig(URL url, ServiceLoader loader)
+throws TikaException, IOException, SAXException {
+this(getBuilder().parse(url.toString()).getDocumentElement(), loader);
+}
 
 public TikaConfig(InputStream stream)
 throws TikaException, IOException, SAXException {
@@ -113,6 +120,9 @@ public class TikaConfig {
 public TikaConfig(Document document) throws TikaException, IOException {
 this(document.getDocumentElement());
 }
+public TikaConfig(Document document, ServiceLoader loader) throws 
TikaException, IOException {
+this(document.getDocumentElement(), loader);
+}
 
 public TikaConfig(Element element) throws TikaException, IOException {
 this(element, new ServiceLoader());
@@ -418,7 +428,8 @@ public class TikaConfig {
 
 // Find the children of the parent tag, if any
 for (Element le : getTopLevelElementChildren(element, 
getParentTagName(), getLoaderTagName())) {
-loaded.add(loadOne(le, mimeTypes, loader));
+T loadedChild = loadOne(le, mimeTypes, loader);
+if (loadedChild != null) loaded.add(loadedChild);
 }
 
 // Build the classes, and wrap as needed
@@ -462,9 +473,9 @@ public class TikaConfig {
 NodeList childNodes = 
element.getElementsByTagName(getLoaderTagName());
 if (childNodes.getLength() > 0) {
 for (int i = 0; i < childNodes.getLength(); i++) {
-children.add(loadOne(
-(Element)childNodes.item(i), mimeTypes, 
loader
-));
+T loadedChild = 
loadOne((Element)childNodes.item(i), 
+mimeTypes, loader);
+if (loadedChild != null) children.add(loadedChild

svn commit: r1688805 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

2015-07-02 Thread nick
Author: nick
Date: Thu Jul  2 10:35:06 2015
New Revision: 1688805

URL: http://svn.apache.org/r1688805
Log:
Remove change comment, TIKA-1602

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1688805&r1=1688804&r2=1688805&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Thu Jul  2 10:35:06 2015
@@ -5112,7 +5112,7 @@
 
   <mime-type type="message/rfc822">
 <magic priority="50">
-  <match value="Status:" type="string" offset="0"/>   <!-- added 
custom by Jeremy B. Merril 4/10/14 -->
+  <match value="Status:" type="string" offset="0"/>
   <match value="Relay-Version:" type="stringignorecase" offset="0"/>
   <match value="#!\ rnews" type="string" offset="0"/>
   <match value="N#!\ rnews" type="string" offset="0"/>




svn commit: r1688810 - in /tika/site: publish/1.10/ publish/1.6/ publish/1.7/ publish/1.8/ publish/1.9/ src/site/apt/1.10/ src/site/apt/1.6/ src/site/apt/1.7/ src/site/apt/1.8/ src/site/apt/1.9/

2015-07-02 Thread nick
Author: nick
Date: Thu Jul  2 12:17:21 2015
New Revision: 1688810

URL: http://svn.apache.org/r1688810
Log:
Mention Outlook MSG support in the mail formats section

Modified:
tika/site/publish/1.10/formats.html
tika/site/publish/1.6/formats.html
tika/site/publish/1.7/formats.html
tika/site/publish/1.8/formats.html
tika/site/publish/1.9/formats.html
tika/site/src/site/apt/1.10/formats.apt
tika/site/src/site/apt/1.6/formats.apt
tika/site/src/site/apt/1.7/formats.apt
tika/site/src/site/apt/1.8/formats.apt
tika/site/src/site/apt/1.9/formats.apt

Modified: tika/site/publish/1.10/formats.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.10/formats.html?rev=1688810&r1=1688809&r2=1688810&view=diff
==
--- tika/site/publish/1.10/formats.html (original)
+++ tika/site/publish/1.10/formats.html Thu Jul  2 12:17:21 2015
@@ -176,6 +176,7 @@
 pThe a 
href=./api/org/apache/tika/parser/mbox/MboxParser.htmlMboxParser/a can 
extract email messages from the mbox format used by many email archives and 
Unix-style mailboxes./p
 pThe a 
href=./api/org/apache/tika/parser/mail/RFC822Parser.htmlRFC822Parser/a can 
process single email messages in the RFC 822 format used by many email clients 
in their archives / exports./p
 pThe a 
href=./api/org/apache/tika/parser/mbox/OutlookPSTParser.htmlOutlookPSTParser/a
 can extract email messages from the Microsoft Outlook PST email format./p
+pThe a 
href=./api/org/apache/tika/parser/microsoft/OutlookExtractor.htmlOutlookExtractor/a
 (part of a 
href=./api/org/apache/tika/parser/microsoft/OfficeParserOfficeParser/a) is 
able to extract email messages from the Microsoft Outlook MSG email format./p
 pThe a 
href=./api/org/apache/tika/parser/microsoft/TNEFParser.htmlTNEFParser/a 
can extract email attachments from the Microsoft TNEF (Transport Neutral 
Encoding Format, aka Winmail.dat) used with some Microsoft email 
clients./p/div
 div class=section
 h3a name=CAD_formatsCAD formats/a/h3

Modified: tika/site/publish/1.6/formats.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.6/formats.html?rev=1688810&r1=1688809&r2=1688810&view=diff
==
--- tika/site/publish/1.6/formats.html (original)
+++ tika/site/publish/1.6/formats.html Thu Jul  2 12:17:21 2015
@@ -172,7 +172,8 @@
 h3a name=Mail_formatsMail formats/a/h3
 pThe a 
href=./api/org/apache/tika/parser/mbox/MboxParser.htmlMboxParser/a can 
extract email messages from the mbox format used by many email archives and 
Unix-style mailboxes./p
 pThe a 
href=./api/org/apache/tika/parser/mail/RFC822Parser.htmlRFC822Parser/a can 
process single email messages in the RFC 822 format used by many email clients 
in their archives / exports./p
-pThe a 
href=./api/org/apache/tika/parser/mbox/PSTParser.htmlPSDParser/a can 
extract email messages from the Microsoft Outlook PST email format./p/div
+pThe a 
href=./api/org/apache/tika/parser/mbox/PSTParser.htmlPSDParser/a can 
extract email messages from the Microsoft Outlook PST email format./p
+pThe a 
href=./api/org/apache/tika/parser/microsoft/OutlookExtractor.htmlOutlookExtractor/a
 (part of a 
href=./api/org/apache/tika/parser/microsoft/OfficeParserOfficeParser/a) is 
able to extract email messages from the Microsoft Outlook MSG email 
format./p/div
 div class=section
 h3a name=CAD_formatsCAD formats/a/h3
 pThe a href=./api/org/apache/tika/parser/dwg/DWGParser.htmlDWGParser/a 
can extract simple metadata from the DWG CAD format./p/div

Modified: tika/site/publish/1.7/formats.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.7/formats.html?rev=1688810&r1=1688809&r2=1688810&view=diff
==
--- tika/site/publish/1.7/formats.html (original)
+++ tika/site/publish/1.7/formats.html Thu Jul  2 12:17:21 2015
@@ -174,7 +174,8 @@
 h3a name=Mail_formatsMail formats/a/h3
 pThe a 
href=./api/org/apache/tika/parser/mbox/MboxParser.htmlMboxParser/a can 
extract email messages from the mbox format used by many email archives and 
Unix-style mailboxes./p
 pThe a 
href=./api/org/apache/tika/parser/mail/RFC822Parser.htmlRFC822Parser/a can 
process single email messages in the RFC 822 format used by many email clients 
in their archives / exports./p
-pThe a 
href=./api/org/apache/tika/parser/mbox/PSTParser.htmlPSDParser/a can 
extract email messages from the Microsoft Outlook PST email format./p/div
+pThe a 
href=./api/org/apache/tika/parser/mbox/PSTParser.htmlPSDParser/a can 
extract email messages from the Microsoft Outlook PST email format./p
+pThe a 
href=./api/org/apache/tika/parser/microsoft/OutlookExtractor.htmlOutlookExtractor/a
 (part of a 
href=./api/org/apache/tika/parser/microsoft/OfficeParserOfficeParser/a) is 
able to extract email messages from the Microsoft Outlook MSG email 
format./p/div
 div class=section
 h3a name=CAD_formatsCAD

svn commit: r1687945 - in /tika/site/src/site/apt: 0.10/parser.apt 1.10/examples.apt 1.3/parser.apt 1.4/parser.apt 1.5/parser.apt 1.6/parser.apt 1.7/examples.apt 1.7/parser.apt 1.8/examples.apt 1.8/pa

2015-06-27 Thread nick
Author: nick
Date: Sat Jun 27 16:13:01 2015
New Revision: 1687945

URL: http://svn.apache.org/r1687945
Log:
Tika javadocs are in /api/ not /apidocs/, correct links

Modified:
tika/site/src/site/apt/0.10/parser.apt
tika/site/src/site/apt/1.10/examples.apt
tika/site/src/site/apt/1.3/parser.apt
tika/site/src/site/apt/1.4/parser.apt
tika/site/src/site/apt/1.5/parser.apt
tika/site/src/site/apt/1.6/parser.apt
tika/site/src/site/apt/1.7/examples.apt
tika/site/src/site/apt/1.7/parser.apt
tika/site/src/site/apt/1.8/examples.apt
tika/site/src/site/apt/1.8/parser.apt
tika/site/src/site/apt/1.9/examples.apt
tika/site/src/site/apt/1.9/parser.apt

Modified: tika/site/src/site/apt/0.10/parser.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/0.10/parser.apt?rev=1687945&r1=1687944&r2=1687945&view=diff
==
--- tika/site/src/site/apt/0.10/parser.apt (original)
+++ tika/site/src/site/apt/0.10/parser.apt Sat Jun 27 16:13:01 2015
@@ -134,7 +134,7 @@ try {
 ---
 
Parser implementations typically use the
-   
{{{./apidocs/org/apache/tika/sax/XHTMLContentHandler.html}XHTMLContentHandler}}
+   {{{./api/org/apache/tika/sax/XHTMLContentHandler.html}XHTMLContentHandler}}
utility class to generate the XHTML output.
 
Dealing with the raw SAX events can be a bit complex, so Apache Tika
@@ -238,7 +238,7 @@ try {
 
Tika also contains some general purpose parser implementations that are
not targeted at any specific document formats. The most notable of these
-   is the 
{{{./apidocs/org/apache/tika/parser/AutoDetectParser.html}AutoDetectParser}}
+   is the 
{{{./api/org/apache/tika/parser/AutoDetectParser.html}AutoDetectParser}}
class that encapsulates all Tika functionality into a single parser that
can handle any types of documents. This parser will automatically determine
the type of the incoming document based on various heuristics and will then

Modified: tika/site/src/site/apt/1.10/examples.apt
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/1.10/examples.apt?rev=1687945&r1=1687944&r2=1687945&view=diff
==
--- tika/site/src/site/apt/1.10/examples.apt (original)
+++ tika/site/src/site/apt/1.10/examples.apt Sat Jun 27 16:13:01 2015
@@ -34,7 +34,7 @@ Apache Tika API Usage Examples
 
 ** {Parsing using the Tika Facade}
 
-   The {{{./apidocs/org/apache/tika/Tika.html}Tika facade}},
+   The {{{./api/org/apache/tika/Tika.html}Tika facade}},
provides a number of very quick and easy ways to have your content
parsed by Tika, and return the resulting plain text
 
@@ -43,9 +43,9 @@ Apache Tika API Usage Examples
 ** {Parsing using the Auto-Detect Parser}
 
For more control, you can call the
-   {{{./apidocs/org/apache/tika/parser/Parser.html}Tika Parsers}}
+   {{{./api/org/apache/tika/parser/Parser.html}Tika Parsers}}
directly. Most likely, you'll want to start out using the 
-   {{{./apidocs/org/apache/tika/parser/AutoDetectParser.html}Auto-Detect 
Parser}},
+   {{{./api/org/apache/tika/parser/AutoDetectParser.html}Auto-Detect Parser}},
which automatically figures out what kind of content you have, then calls 
the appropriate
parser for you.
 
@@ -63,7 +63,7 @@ Apache Tika API Usage Examples
 ** {Parsing to Plain Text}
 
By using the 
-   
{{{./apidocs/org/apache/tika/sax/BodyContentHandler.html}BodyContentHandler}},
+   {{{./api/org/apache/tika/sax/BodyContentHandler.html}BodyContentHandler}},
you can request that Tika return only the content of the document's body as
a plain-text string.
 
@@ -72,15 +72,15 @@ Apache Tika API Usage Examples
 ** {Parsing to XHTML}
 
By using the 
-   
{{{./apidocs/org/apache/tika/sax/ToXMLContentHandler.html}ToXMLContentHandler}},
+   {{{./api/org/apache/tika/sax/ToXMLContentHandler.html}ToXMLContentHandler}},
you can get the XHTML content of the whole document as a string.
 
 
%{include|source=src/examples-src/main/java/org/apache/tika/example/ContentHandlerExample.java|snippet=aj:..parseToHTML()|show-gutter=false}
 
If you just want the body of the xhtml document, without the header, you
can chain together a 
-   
{{{./apidocs/org/apache/tika/sax/BodyContentHandler.html}BodyContentHandler}}
-   and a 
{{{./apidocs/org/apache/tika/sax/ToXMLContentHandler.html}ToXMLContentHandler}}
+   {{{./api/org/apache/tika/sax/BodyContentHandler.html}BodyContentHandler}}
+   and a 
{{{./api/org/apache/tika/sax/ToXMLContentHandler.html}ToXMLContentHandler}}
as shown:
 
 
%{include|source=src/examples-src/main/java/org/apache/tika/example/ContentHandlerExample.java|snippet=aj:..parseBodyToHTML()|show-gutter=false}
@@ -103,7 +103,7 @@ Apache Tika API Usage Examples
 ** {Extract Phone Numbers from Content into the Metadata}
 
By using the 
-   
{{{./apidocs/org/apache/tika/sax/PhoneExtractingContentHandler.html

svn commit: r1687946 [2/4] - in /tika/site/publish: 0.10/parser.html 1.10/examples.html 1.3/parser.html 1.4/parser.html 1.5/parser.html 1.6/parser.html 1.7/examples.html 1.7/parser.html 1.8/examples.h

2015-06-27 Thread nick
Modified: tika/site/publish/1.7/examples.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.7/examples.html?rev=1687946&r1=1687945&r2=1687946&view=diff
==
--- tika/site/publish/1.7/examples.html (original)
+++ tika/site/publish/1.7/examples.html Sat Jun 27 16:15:19 2015
@@ -112,23 +112,23 @@
 pTika provides a number of different ways to parse a file. These provide 
different levels of control, flexibility, and complexity./p
 div class=section
 h4a name=Parsing_using_the_Tika_FacadeParsing using the Tika 
Facade/a/h4
-pThe a href=./apidocs/org/apache/tika/Tika.htmlTika facade/a, provides 
a number of very quick and easy ways to have your content parsed by Tika, and 
return the resulting plain text/pstyle type=text/css
+pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a 
number of very quick and easy ways to have your content parsed by Tika, and 
return the resulting plain text/pstyle type=text/css
@import url('attached-includes/css/shCoreDefault.css');
 /style
 div id=highlighter_166145 class=syntaxhighlighter nogutter  javatable 
border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv 
class=containerdiv class=line number49 index0 alt2code class=java 
keywordpublic/code code class=java plainString parseToStringExample() 
/codecode class=java keywordthrows/code code class=java 
plainIOException, SAXException, TikaException {/code/divdiv class=line 
number50 index1 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream 
stream = ParsingExample./codecode class=java keywordclass/codecode 
class=java plain.getResourceAsStream(/codecode class=java 
stringtest.doc/codecode class=java plain);/code/divdiv 
class=line number51 index2 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = 
/codecode class=java keywordnew/code code class=java 
plainTika();/code/div
 div class=line number52 index3 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code 
code class=java plain{/code/divdiv class=line number53 index4 
alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java keywordreturn/code code class=java 
plaintika.parseToString(stream);/code/divdiv class=line number54 index5 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain} /codecode class=java keywordfinally/code code 
class=java plain{/code/divdiv class=line number55 index6 alt2code 
class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainstream.close();/code/divdiv class=line number56 index7 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain}/code/divdiv class=line number57 index8 alt2code 
class=java plain}/code/div/
 div/td/tr/tbody/table/div/div
 div class=section
 h4a name=Parsing_using_the_Auto-Detect_ParserParsing using the 
Auto-Detect Parser/a/h4
-pFor more control, you can call the a 
href=./apidocs/org/apache/tika/parser/Parser.htmlTika Parsers/a directly. 
Most likely, you'll want to start out using the a 
href=./apidocs/org/apache/tika/parser/AutoDetectParser.htmlAuto-Detect 
Parser/a, which automatically figures out what kind of content you have, then 
calls the appropriate parser for you./pdiv id=highlighter_969506 
class=syntaxhighlighter nogutter  javatable border=0 cellpadding=0 
cellspacing=0tbodytrtd class=codediv class=containerdiv 
class=line number83 index0 alt2code class=java keywordpublic/code 
code class=java plainString parseExample() /codecode class=java 
keywordthrows/code code class=java plainIOException, SAXException, 
TikaException {/code/divdiv class=line number84 index1 alt1code 
class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java 
plainInputStream stream = ParsingExample./codecode class=java 
keywordclass/codec
 ode class=java plain.getResourceAsStream(/codecode class=java 
stringtest.doc/codecode class=java plain);/code/divdiv 
class=line number85 index2 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java 
plainAutoDetectParser parser = /codecode class=java keywordnew/code 
code class=java plainAutoDetectParser();/code/divdiv class=line 
number86 index3 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java 
plainBodyContentHandler handler = /codecode class=java 
keywordnew/code code class=java 
plainBodyContentHandler();/code/divdiv class=line number87 index4 
alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainMetadata metadata = /codecode class=java 
keywordnew/code code class=java plainMetadata();/code/divdiv 
class=line number88 index5 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtr
 y/code code class=java plain{/code/divdiv class=line number89 
index6 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainparser.parse(stream, handler, metadata);/code/divdiv 
class=line number90 index7 alt1code 

svn commit: r1687946 [4/4] - in /tika/site/publish: 0.10/parser.html 1.10/examples.html 1.3/parser.html 1.4/parser.html 1.5/parser.html 1.6/parser.html 1.7/examples.html 1.7/parser.html 1.8/examples.h

2015-06-27 Thread nick
Modified: tika/site/publish/1.9/examples.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.9/examples.html?rev=1687946&r1=1687945&r2=1687946&view=diff
==
--- tika/site/publish/1.9/examples.html (original)
+++ tika/site/publish/1.9/examples.html Sat Jun 27 16:15:19 2015
@@ -113,23 +113,23 @@
 pTika provides a number of different ways to parse a file. These provide 
different levels of control, flexibility, and complexity./p
 div class=section
 h4a name=Parsing_using_the_Tika_FacadeParsing using the Tika 
Facade/a/h4
-pThe a href=./apidocs/org/apache/tika/Tika.htmlTika facade/a, provides 
a number of very quick and easy ways to have your content parsed by Tika, and 
return the resulting plain text/pstyle type=text/css
+pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a 
number of very quick and easy ways to have your content parsed by Tika, and 
return the resulting plain text/pstyle type=text/css
@import url('attached-includes/css/shCoreDefault.css');
 /style
 div id=highlighter_294673 class=syntaxhighlighter nogutter  javatable 
border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv 
class=containerdiv class=line number49 index0 alt2code class=java 
keywordpublic/code code class=java plainString parseToStringExample() 
/codecode class=java keywordthrows/code code class=java 
plainIOException, SAXException, TikaException {/code/divdiv class=line 
number50 index1 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream 
stream = ParsingExample./codecode class=java keywordclass/codecode 
class=java plain.getResourceAsStream(/codecode class=java 
stringtest.doc/codecode class=java plain);/code/divdiv 
class=line number51 index2 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = 
/codecode class=java keywordnew/code code class=java 
plainTika();/code/div
 div class=line number52 index3 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code 
code class=java plain{/code/divdiv class=line number53 index4 
alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java keywordreturn/code code class=java 
plaintika.parseToString(stream);/code/divdiv class=line number54 index5 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain} /codecode class=java keywordfinally/code code 
class=java plain{/code/divdiv class=line number55 index6 alt2code 
class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainstream.close();/code/divdiv class=line number56 index7 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain}/code/divdiv class=line number57 index8 alt2code 
class=java plain}/code/div/
 div/td/tr/tbody/table/div/div
 div class=section
 h4a name=Parsing_using_the_Auto-Detect_ParserParsing using the 
Auto-Detect Parser/a/h4
-pFor more control, you can call the a 
href=./apidocs/org/apache/tika/parser/Parser.htmlTika Parsers/a directly. 
Most likely, you'll want to start out using the a 
href=./apidocs/org/apache/tika/parser/AutoDetectParser.htmlAuto-Detect 
Parser/a, which automatically figures out what kind of content you have, then 
calls the appropriate parser for you./pdiv id=highlighter_420078 
class=syntaxhighlighter nogutter  javatable border=0 cellpadding=0 
cellspacing=0tbodytrtd class=codediv class=containerdiv 
class=line number83 index0 alt2code class=java keywordpublic/code 
code class=java plainString parseExample() /codecode class=java 
keywordthrows/code code class=java plainIOException, SAXException, 
TikaException {/code/divdiv class=line number84 index1 alt1code 
class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java 
plainInputStream stream = ParsingExample./codecode class=java 
keywordclass/codec
 ode class=java plain.getResourceAsStream(/codecode class=java 
stringtest.doc/codecode class=java plain);/code/divdiv 
class=line number85 index2 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java 
plainAutoDetectParser parser = /codecode class=java keywordnew/code 
code class=java plainAutoDetectParser();/code/divdiv class=line 
number86 index3 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java 
plainBodyContentHandler handler = /codecode class=java 
keywordnew/code code class=java 
plainBodyContentHandler();/code/divdiv class=line number87 index4 
alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainMetadata metadata = /codecode class=java 
keywordnew/code code class=java plainMetadata();/code/divdiv 
class=line number88 index5 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtr
 y/code code class=java plain{/code/divdiv class=line number89 
index6 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainparser.parse(stream, handler, metadata);/code/divdiv 
class=line number90 index7 alt1code 

svn commit: r1687946 [3/4] - in /tika/site/publish: 0.10/parser.html 1.10/examples.html 1.3/parser.html 1.4/parser.html 1.5/parser.html 1.6/parser.html 1.7/examples.html 1.7/parser.html 1.8/examples.h

2015-06-27 Thread nick
Modified: tika/site/publish/1.8/examples.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.8/examples.html?rev=1687946&r1=1687945&r2=1687946&view=diff
==
--- tika/site/publish/1.8/examples.html (original)
+++ tika/site/publish/1.8/examples.html Sat Jun 27 16:15:19 2015
@@ -112,23 +112,23 @@
 pTika provides a number of different ways to parse a file. These provide 
different levels of control, flexibility, and complexity./p
 div class=section
 h4a name=Parsing_using_the_Tika_FacadeParsing using the Tika 
Facade/a/h4
-pThe a href=./apidocs/org/apache/tika/Tika.htmlTika facade/a, provides 
a number of very quick and easy ways to have your content parsed by Tika, and 
return the resulting plain text/pstyle type=text/css
+pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a 
number of very quick and easy ways to have your content parsed by Tika, and 
return the resulting plain text/pstyle type=text/css
@import url('attached-includes/css/shCoreDefault.css');
 /style
 div id=highlighter_489927 class=syntaxhighlighter nogutter  javatable 
border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv 
class=containerdiv class=line number49 index0 alt2code class=java 
keywordpublic/code code class=java plainString parseToStringExample() 
/codecode class=java keywordthrows/code code class=java 
plainIOException, SAXException, TikaException {/code/divdiv class=line 
number50 index1 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainInputStream 
stream = ParsingExample./codecode class=java keywordclass/codecode 
class=java plain.getResourceAsStream(/codecode class=java 
stringtest.doc/codecode class=java plain);/code/divdiv 
class=line number51 index2 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java plainTika tika = 
/codecode class=java keywordnew/code code class=java 
plainTika();/code/div
 div class=line number52 index3 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtry/code 
code class=java plain{/code/divdiv class=line number53 index4 
alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java keywordreturn/code code class=java 
plaintika.parseToString(stream);/code/divdiv class=line number54 index5 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain} /codecode class=java keywordfinally/code code 
class=java plain{/code/divdiv class=line number55 index6 alt2code 
class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainstream.close();/code/divdiv class=line number56 index7 
alt1code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plain}/code/divdiv class=line number57 index8 alt2code 
class=java plain}/code/div/
 div/td/tr/tbody/table/div/div
 div class=section
 h4a name=Parsing_using_the_Auto-Detect_ParserParsing using the 
Auto-Detect Parser/a/h4
-pFor more control, you can call the a 
href=./apidocs/org/apache/tika/parser/Parser.htmlTika Parsers/a directly. 
Most likely, you'll want to start out using the a 
href=./apidocs/org/apache/tika/parser/AutoDetectParser.htmlAuto-Detect 
Parser/a, which automatically figures out what kind of content you have, then 
calls the appropriate parser for you./pdiv id=highlighter_934037 
class=syntaxhighlighter nogutter  javatable border=0 cellpadding=0 
cellspacing=0tbodytrtd class=codediv class=containerdiv 
class=line number83 index0 alt2code class=java keywordpublic/code 
code class=java plainString parseExample() /codecode class=java 
keywordthrows/code code class=java plainIOException, SAXException, 
TikaException {/code/divdiv class=line number84 index1 alt1code 
class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java 
plainInputStream stream = ParsingExample./codecode class=java 
keywordclass/codec
 ode class=java plain.getResourceAsStream(/codecode class=java 
stringtest.doc/codecode class=java plain);/code/divdiv 
class=line number85 index2 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java 
plainAutoDetectParser parser = /codecode class=java keywordnew/code 
code class=java plainAutoDetectParser();/code/divdiv class=line 
number86 index3 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java 
plainBodyContentHandler handler = /codecode class=java 
keywordnew/code code class=java 
plainBodyContentHandler();/code/divdiv class=line number87 index4 
alt2code class=java spacesnbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainMetadata metadata = /codecode class=java 
keywordnew/code code class=java plainMetadata();/code/divdiv 
class=line number88 index5 alt1code class=java 
spacesnbsp;nbsp;nbsp;nbsp;/codecode class=java keywordtr
 y/code code class=java plain{/code/divdiv class=line number89 
index6 alt2code class=java 
spacesnbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;nbsp;/codecode 
class=java plainparser.parse(stream, handler, metadata);/code/divdiv 
class=line number90 index7 alt1code 

svn commit: r1687946 [1/4] - in /tika/site/publish: 0.10/parser.html 1.10/examples.html 1.3/parser.html 1.4/parser.html 1.5/parser.html 1.6/parser.html 1.7/examples.html 1.7/parser.html 1.8/examples.h

2015-06-27 Thread nick
Author: nick
Date: Sat Jun 27 16:15:19 2015
New Revision: 1687946

URL: http://svn.apache.org/r1687946
Log:
Tika javadocs are in /api/ not /apidocs/, correct links

Modified:
tika/site/publish/0.10/parser.html
tika/site/publish/1.10/examples.html
tika/site/publish/1.3/parser.html
tika/site/publish/1.4/parser.html
tika/site/publish/1.5/parser.html
tika/site/publish/1.6/parser.html
tika/site/publish/1.7/examples.html
tika/site/publish/1.7/parser.html
tika/site/publish/1.8/examples.html
tika/site/publish/1.8/parser.html
tika/site/publish/1.9/examples.html
tika/site/publish/1.9/parser.html

Modified: tika/site/publish/0.10/parser.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/0.10/parser.html?rev=1687946&r1=1687945&r2=1687946&view=diff
==
--- tika/site/publish/0.10/parser.html (original)
+++ tika/site/publish/0.10/parser.html Sat Jun 27 16:15:19 2015
@@ -131,7 +131,7 @@ try {
 ...
   lt;/bodygt;
 lt;/htmlgt;/pre/div
-pParser implementations typically use the a 
href=./apidocs/org/apache/tika/sax/XHTMLContentHandler.htmlXHTMLContentHandler/a
 utility class to generate the XHTML output./p
+pParser implementations typically use the a 
href=./api/org/apache/tika/sax/XHTMLContentHandler.htmlXHTMLContentHandler/a
 utility class to generate the XHTML output./p
 pDealing with the raw SAX events can be a bit complex, so Apache Tika comes 
with a number of utility classes that can be used to process and convert the 
event stream to other representations./p
 pFor example, the a 
href=./api/org/apache/tika/sax/BodyContentHandler.htmlBodyContentHandler/a 
class can be used to extract just the body part of the XHTML output and feed it 
either as SAX events to another content handler or as characters to an output 
stream, a writer, or simply a string. The following code snippet parses a 
document from the standard input stream and outputs the extracted text content 
to standard output:/p
 div
@@ -173,7 +173,7 @@ try {
 h3Parser implementationsa name=Parser_implementations/a/h3
 pApache Tika comes with a number of parser classes for parsing a 
href=./formats.htmlvarious document formats/a. You can also extend Tika 
with your own parsers, and of course any contributions to Tika are warmly 
welcome./p
 pThe goal of Tika is to reuse existing parser libraries like a 
class=externalLink href=http://pdfbox.apache.org/;PDFBox/a or a 
class=externalLink href=http://poi.apache.org/;Apache POI/a as much as 
possible, and so most of the parser classes in Tika are adapters to such 
external libraries./p
-pTika also contains some general purpose parser implementations that are not 
targeted at any specific document formats. The most notable of these is the a 
href=./apidocs/org/apache/tika/parser/AutoDetectParser.htmlAutoDetectParser/a
 class that encapsulates all Tika functionality into a single parser that can 
handle any types of documents. This parser will automatically determine the 
type of the incoming document based on various heuristics and will then parse 
the document accordingly./p/div/div
+pTika also contains some general purpose parser implementations that are not 
targeted at any specific document formats. The most notable of these is the a 
href=./api/org/apache/tika/parser/AutoDetectParser.htmlAutoDetectParser/a 
class that encapsulates all Tika functionality into a single parser that can 
handle any types of documents. This parser will automatically determine the 
type of the incoming document based on various heuristics and will then parse 
the document accordingly./p/div/div
   /div
   div id=sidebar
 div id=navigation

Modified: tika/site/publish/1.10/examples.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.10/examples.html?rev=1687946&r1=1687945&r2=1687946&view=diff
==
--- tika/site/publish/1.10/examples.html (original)
+++ tika/site/publish/1.10/examples.html Sat Jun 27 16:15:19 2015
@@ -113,23 +113,23 @@
 pTika provides a number of different ways to parse a file. These provide 
different levels of control, flexibility, and complexity./p
 div class=section
 h4a name=Parsing_using_the_Tika_FacadeParsing using the Tika 
Facade/a/h4
-pThe a href=./apidocs/org/apache/tika/Tika.htmlTika facade/a, provides 
a number of very quick and easy ways to have your content parsed by Tika, and 
return the resulting plain text/pstyle type=text/css
+pThe a href=./api/org/apache/tika/Tika.htmlTika facade/a, provides a 
number of very quick and easy ways to have your content parsed by Tika, and 
return the resulting plain text/pstyle type=text/css
@import url('attached-includes/css/shCoreDefault.css');
 /style
 div id=highlighter_177280 class=syntaxhighlighter nogutter  javatable 
border=0 cellpadding=0 cellspacing=0tbodytrtd class=codediv 
class=containerdiv class=line number49 index0 alt2code

svn commit: r1687102 - in /tika/site/src/site/apt/1.10: ./ examples.apt formats.apt

2015-06-23 Thread nick
Author: nick
Date: Tue Jun 23 17:29:17 2015
New Revision: 1687102

URL: http://svn.apache.org/r1687102
Log:
Start tracking formats and examples for 1.10

Added:
tika/site/src/site/apt/1.10/
tika/site/src/site/apt/1.10/examples.apt
  - copied unchanged from r1687095, tika/site/src/site/apt/1.9/examples.apt
tika/site/src/site/apt/1.10/formats.apt
  - copied, changed from r1687095, tika/site/src/site/apt/1.9/formats.apt

Copied: tika/site/src/site/apt/1.10/formats.apt (from r1687095, 
tika/site/src/site/apt/1.9/formats.apt)
URL: 
http://svn.apache.org/viewvc/tika/site/src/site/apt/1.10/formats.apt?p2=tika/site/src/site/apt/1.10/formats.apt&p1=tika/site/src/site/apt/1.9/formats.apt&r1=1687095&r2=1687102&rev=1687102&view=diff
==
--- tika/site/src/site/apt/1.9/formats.apt (original)
+++ tika/site/src/site/apt/1.10/formats.apt Tue Jun 23 17:29:17 2015
@@ -20,7 +20,7 @@
 Supported Document Formats
 
This page lists all the document formats supported by the parsers in
-   Apache Tika 1.9. Follow the links to the various parser class javadocs 
+   Apache Tika 1.10. Follow the links to the various parser class javadocs 
for more detailed information about each document format and how it is 
parsed by Tika.
 
@@ -248,6 +248,10 @@ Supported Document Formats
is able to extract attribute metadata from the ISO-19139 georgraphic 
information file format.
 
+   The {{{./api/org/apache/tika/parser/geo/topic/GeoParser.html}GeoParser}}
+   is makes use of a pre-built collection of a geographic gazetteer, to 
+   resolve geographic entities into their positions into the metadata
+
The {{{./api/org/apache/tika/parser/grib/GribParser.html}GribParser}}
is able to extract attribute metadata from the Grib scientific file format.
 




svn commit: r1686199 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

2015-06-18 Thread nick
Author: nick
Date: Thu Jun 18 12:06:20 2015
New Revision: 1686199

URL: http://svn.apache.org/r1686199
Log:
Add a mime type definition for Java properties files, after a discussion on 
Stack Overflow showed we didn't have one

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1686199&r1=1686198&r2=1686199&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Thu Jun 18 12:06:20 2015
@@ -5392,7 +5392,6 @@
 glob pattern=*.pod/
 glob pattern=*.pom/
 glob pattern=*.project/
-glob pattern=*.properties/
 glob pattern=*.rng/
 glob pattern=*.rnx/
 glob pattern=*.roles/
@@ -5735,6 +5734,14 @@
 sub-class-of type=text/plain/
   /mime-type
 
+  mime-type type=text/x-java-properties
+_commentJava Properties/_comment
+alias type=text/x-properties /
+alias type=text/properties /
+glob pattern=*.properties/
+sub-class-of type=text/plain/
+  /mime-type
+
   mime-type type=text/x-jsp
 _commentJava Server Page/_comment
 alias type=application/x-httpd-jsp/




svn commit: r1686315 - in /tika/trunk/tika-parsers/src/test: java/org/apache/tika/mime/TestMimeTypes.java resources/test-documents/testJAVAPROPS.properties

2015-06-18 Thread nick
Author: nick
Date: Thu Jun 18 20:12:54 2015
New Revision: 1686315

URL: http://svn.apache.org/r1686315
Log:
TIKA-1660 Java Properties sample file and detection test, follows on from 
r1686199

Added:

tika/trunk/tika-parsers/src/test/resources/test-documents/testJAVAPROPS.properties
Modified:

tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1686315&r1=1686314&r2=1686315&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
(original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
Thu Jun 18 20:12:54 2015
@@ -944,6 +944,9 @@ public class TestMimeTypes {
 assertTypeByData(text/x-csrc, testC.c);
 assertTypeByData(text/x-chdr, testH.h);
 
+assertTypeByName(text/x-java-source, testJAVA.java);
+assertType(text/x-java-properties, testJAVAPROPS.properties);
+
 assertType(text/x-matlab, testMATLAB.m);
 assertType(text/x-matlab, testMATLAB_wtsgaus.m);
 assertType(text/x-matlab, testMATLAB_barcast.m);
@@ -970,6 +973,7 @@ public class TestMimeTypes {
 private void assertType(String expected, String filename) throws Exception 
{
 InputStream stream = TestMimeTypes.class.getResourceAsStream(
 /test-documents/ + filename);
+assertNotNull(Test file not found:  + filename, stream);
 try {
 Metadata metadata = new Metadata();
 metadata.set(Metadata.RESOURCE_NAME_KEY, filename);

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testJAVAPROPS.properties
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJAVAPROPS.properties?rev=1686315&view=auto
==
--- 
tika/trunk/tika-parsers/src/test/resources/test-documents/testJAVAPROPS.properties
 (added)
+++ 
tika/trunk/tika-parsers/src/test/resources/test-documents/testJAVAPROPS.properties
 Thu Jun 18 20:12:54 2015
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the License); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+tika=great
+file.content.detection=often.hard
+properties\:files=fun with special characters...
+
+# Logs please!
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n




svn commit: r1684187 - in /tika/trunk: tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java

2015-06-08 Thread nick
Author: nick
Date: Mon Jun  8 13:55:16 2015
New Revision: 1684187

URL: http://svn.apache.org/r1684187
Log:
Improve how the Tika CLI reports decorated parsers in --list-parsers

Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java

tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1684187&r1=1684186&r2=1684187&view=diff
==
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Mon Jun  
8 13:55:16 2015
@@ -702,12 +702,20 @@ public class TikaCLI {
 }
  
 private void displayParser(Parser p, boolean includeMimeTypes, boolean 
apt, int i) {
+String decorated = null;
+if (p instanceof ParserDecorator) {
+ParserDecorator pd = (ParserDecorator)p;
+decorated =  (Wrapped by  + pd.getDecorationName() + );
+p = pd.getWrappedParser();
+}
+
 boolean isComposite = (p instanceof CompositeParser);
-String name = (p instanceof ParserDecorator) ?
-  ((ParserDecorator) 
p).getWrappedParser().getClass().getName() :
-  p.getClass().getName();
-if (apt){
+String name = p.getClass().getName();
+  
+if (apt) {
 name = name.substring(0, name.lastIndexOf(.) + 1) + {{{./api/ 
+ name.replace(., /) + } + name.substring(name.lastIndexOf(.) + 1) + 
}};
+} else if (decorated != null) {
+name += decorated;
 }
 if ((apt  !isComposite) || !apt) {// Don't display Composite 
parsers in the apt output.
 System.out.println(indent(i) + ((apt) ? *  : ) + name + 
(isComposite ?  (Composite Parser): : ));

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=1684187&r1=1684186&r2=1684187&view=diff
==
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java 
Mon Jun  8 13:55:16 2015
@@ -56,6 +56,10 @@ public class ParserDecorator extends Abs
 public SetMediaType getSupportedTypes(ParseContext context) {
 return types;
 }
+@Override
+public String getDecorationName() {
+return With Types;
+}
 };
 }
 
@@ -81,6 +85,10 @@ public class ParserDecorator extends Abs
 // Return whatever is left
 return parserTypes;
 }
+@Override
+public String getDecorationName() {
+return Without Types;
+}
 };
 }
 
@@ -125,6 +133,10 @@ public class ParserDecorator extends Abs
 tstream.reset();
 }
 }
+@Override
+public String getDecorationName() {
+return With Fallback;
+}
 };
 }
 
@@ -163,6 +175,12 @@ public class ParserDecorator extends Abs
 parser.parse(stream, handler, metadata, context);
 }
 
+/**
+ * @return A name/description of the decoration, or null if none available
+ */
+public String getDecorationName() {
+return null;
+}
 
 /**
  * Gets the parser wrapped by this ParserDecorator




svn commit: r1684170 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes: CTAKESAnnotationProperty.java CTAKESConfig.java CTAKESContentHandler.java CTAKESParser.java CTAKESSerial

2015-06-08 Thread nick
Author: nick
Date: Mon Jun  8 12:25:15 2015
New Revision: 1684170

URL: http://svn.apache.org/r1684170
Log:
Fix indents to match http://tika.apache.org/contribute.html#Code_Formatting 
TIKA-1642

Modified:

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java?rev=1684170&r1=1684169&r2=1684170&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
 Mon Jun  8 12:25:15 2015
@@ -23,24 +23,24 @@ import org.apache.ctakes.typesystem.type
  *
  */
 public enum CTAKESAnnotationProperty {
-   BEGIN(start),
-   END(end),
-   CONDITIONAL(conditional),
-   CONFIDENCE(confidence),
-   DISCOVERY_TECNIQUE(discoveryTechnique),
-   GENERIC(generic),
-   HISTORY_OF(historyOf),
-   ID(id),
-   ONTOLOGY_CONCEPT_ARR(ontologyConceptArr),
-   POLARITY(polarity);
-   
-   private String name;
-   
-   CTAKESAnnotationProperty(String name) {
-   this.name = name;
-   }
-   
-   public String getName() {
-   return name;
-   }
+BEGIN(start),
+END(end),
+CONDITIONAL(conditional),
+CONFIDENCE(confidence),
+DISCOVERY_TECNIQUE(discoveryTechnique),
+GENERIC(generic),
+HISTORY_OF(historyOf),
+ID(id),
+ONTOLOGY_CONCEPT_ARR(ontologyConceptArr),
+POLARITY(polarity);
+
+private String name;
+
+CTAKESAnnotationProperty(String name) {
+this.name = name;
+}
+
+public String getName() {
+return name;
+}
 }
\ No newline at end of file

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java?rev=1684170&r1=1684169&r2=1684170&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
 Mon Jun  8 12:25:15 2015
@@ -24,314 +24,313 @@ import java.util.Properties;
 
 import org.apache.tika.io.NullOutputStream;
 
-/*
+/**
  * Configuration for {@see CTAKESContentHandler}.
  * 
  * This class allows to enable cTAKES and set its parameters.
- * 
  */
 public class CTAKESConfig implements Serializable {
-   /**
-* Serial version UID
-*/
-   private static final long serialVersionUID = -1599741171775528923L;
-   
-   // Path to XML descriptor for AnalysisEngine
-   private String aeDescriptorPath = 
/ctakes-core/desc/analysis_engine/SentencesAndTokensAggregate.xml;
-   
-   // UMLS username
-   private String UMLSUser = ;
-   
-   // UMLS password
-   private String UMLSPass = ;
-   
-   // Enables formatted output
-   private boolean prettyPrint = true; 
-   
-   // Type of cTAKES (UIMA) serializer
-   private CTAKESSerializer serializerType = CTAKESSerializer.XMI;
-   
-   // OutputStream object used for CAS serialization
-   private OutputStream stream = NullOutputStream.NULL_OUTPUT_STREAM;
-   
-   // Enables CAS serialization
-   private boolean serialize = false;
-   
-   // Enables text analysis using cTAKES
-   private boolean text = true;
-   
-   // List of metadata to analyze using cTAKES
-   private String[] metadata = null;
-   
-   // List of annotation properties to add to metadata in addition to text 
covered by an annotation
-   private CTAKESAnnotationProperty[] annotationProps = null;
-   
-   // Character used to separate the annotation properties into metadata
-   private char separatorChar = ':';
-
-   /**
-* Default constructor.
-*/
-   public CTAKESConfig() {
-   
init(this.getClass().getResourceAsStream(CTAKESConfig.properties));
-   }
-   
-   /**
-* Loads properties from InputStream

svn commit: r1684201 - /tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java

2015-06-08 Thread nick
Author: nick
Date: Mon Jun  8 15:05:20 2015
New Revision: 1684201

URL: http://svn.apache.org/r1684201
Log:
Make the nesting more visually obvious in the Server HTML parsers listing

Modified:

tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java

Modified: 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java?rev=1684201&r1=1684200&r2=1684201&view=diff
==
--- 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java
 (original)
+++ 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java
 Mon Jun  8 15:05:20 2015
@@ -96,9 +96,11 @@ public class TikaParsers {
 }
 if (p.isComposite) {
 html.append(pComposite Parser/p);
+html.append(div style=\margin-left: 1em\\n);
 for (Parser cp : p.childParsers) {
 parserAsHTML(new ParserDetails(cp), withMimeTypes, html, level 
+ 1);
 }
+html.append(/div\n);
 } else if (withMimeTypes) {
 html.append(pMime Types:);
 html.append(ul);
@@ -110,6 +112,7 @@ public class TikaParsers {
 html.append(/ul);
 html.append(/p);
 }
+html.append(\n);
 }
 
 @GET




svn commit: r1684206 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: config/TikaConfig.java parser/ParserDecorator.java

2015-06-08 Thread nick
Author: nick
Date: Mon Jun  8 15:28:45 2015
New Revision: 1684206

URL: http://svn.apache.org/r1684206
Log:
Allow Tika Config xml to have a ParserDecorator with child parsers, and add a 
note in the javadocs about how this can work

Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1684206&r1=1684205&r2=1684206&view=diff
==
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Mon Jun  8 15:28:45 2015
@@ -381,8 +381,10 @@ public class TikaConfig {
 +  configuration element:  + name);
 }
 
-// Is this a composite parser? If so, support recursion
-if (CompositeParser.class.isAssignableFrom(parserClass)) {
+// Is this a composite or decorated parser? If so, support 
recursion
+if (CompositeParser.class.isAssignableFrom(parserClass) ||
+ParserDecorator.class.isAssignableFrom(parserClass)) {
+
 // Get the child parsers for it
 ListParser childParsers = new ArrayListParser();
 NodeList childParserNodes = 
parserNode.getElementsByTagName(parser);
@@ -407,20 +409,36 @@ public class TikaConfig {
 
 // Create the Composite Parser
 Constructor? extends Parser c = null;
-if (c == null) {
+MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
+if (parser == null) {
 try {
 c = 
parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, 
Collection.class);
-parser = 
c.newInstance(mimeTypes.getMediaTypeRegistry(), loader, excludeParsers);
+parser = c.newInstance(registry, loader, 
excludeParsers);
 } 
 catch (NoSuchMethodException me) {}
 }
-if (c == null) {
+if (parser == null) {
 try {
 c = 
parserClass.getConstructor(MediaTypeRegistry.class, List.class, 
Collection.class);
-parser = 
c.newInstance(mimeTypes.getMediaTypeRegistry(), childParsers, excludeParsers);
+parser = c.newInstance(registry, childParsers, 
excludeParsers);
+} catch (NoSuchMethodException me) {}
+}
+// Create as a Parser Decorator
+if (parser == null  
ParserDecorator.class.isAssignableFrom(parserClass)) {
+try {
+CompositeParser cp = null;
+if (childParsers.size() == 1  excludeParsers.size() 
== 0 
+childParsers.get(0) instanceof 
CompositeParser) {
+cp = (CompositeParser)childParsers.get(0);
+} else {
+cp = new CompositeParser(registry, childParsers, 
excludeParsers);
+}
+c = parserClass.getConstructor(Parser.class);
+parser = c.newInstance(cp);
 } catch (NoSuchMethodException me) {}
 }
-if (c == null) {
+// Default constructor
+if (parser == null) {
 parser = parserClass.newInstance();
 }
 } else {

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=1684206&r1=1684205&r2=1684206&view=diff
==
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java 
Mon Jun  8 15:28:45 2015
@@ -30,10 +30,12 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
- * Decorator base class for the {@link Parser} interface. This class
- * simply delegates all parsing calls to an underlying decorated parser
- * instance. Subclasses can provide extra decoration by overriding the
+ * Decorator base class for the {@link Parser} interface. 
+ * pThis class simply delegates all parsing calls to an underlying decorated 
+ * parser instance. Subclasses can provide extra decoration by overriding the
  * parse method.
+ * pTo

svn commit: r1684207 - in /tika/trunk: tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java

2015-06-08 Thread nick
Author: nick
Date: Mon Jun  8 15:29:26 2015
New Revision: 1684207

URL: http://svn.apache.org/r1684207
Log:
cTAKES config xml example and code example in JavaDocs TIKA-1642

Added:

tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml
Modified:

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java

Added: 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml?rev=1684207&view=auto
==
--- 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml
 (added)
+++ 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1642-CTakes.xml
 Mon Jun  8 15:29:26 2015
@@ -0,0 +1,24 @@
+?xml version=1.0 encoding=UTF-8?
+!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the License); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an AS IS BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--
+properties
+  parsers
+parser class=org.apache.tika.parser.ctakes.CTAKESParser
+   parser class=org.apache.tika.parser.DefaultParser/
+/parser
+  /parsers
+/properties

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1684207&r1=1684206&r2=1684207&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
 Mon Jun  8 15:29:26 2015
@@ -35,7 +35,7 @@ import org.xml.sax.SAXException;
  * clinical text using Apache cTAKES.
  * pIt is normally called by supplying an instance to 
  *  {@link AutoDetectParser}, such as:
- * codeAutoDetectParser parser = new AutoDetectParser(new 
CTakesParser());/code
+ * codeAutoDetectParser parser = new AutoDetectParser(new 
CTAKESParser());/code
  * pIt can also be used by giving a Tika Config file similar to:
  * code
  *  gt;properties




svn commit: r1684199 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/ tika-core/src/test/java/org/apache/tika/config/ tika-core/src/test/resources/org/apache/tika/config/ tika-parsers

2015-06-08 Thread nick
Author: nick
Date: Mon Jun  8 14:41:48 2015
New Revision: 1684199

URL: http://svn.apache.org/r1684199
Log:
TIKA-1653 Re-do the XML parsing in the Tika Config, so that a parser tag with 
another inside it doesn't get accidentally duplicated at the top level

Added:

tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java

tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1684199&r1=1684198&r2=1684199&view=diff
==
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Mon Jun  8 14:41:48 2015
@@ -332,10 +332,25 @@ public class TikaConfig {
 Element element, MimeTypes mimeTypes, ServiceLoader loader)
 throws TikaException, IOException {
 ListParser parsers = new ArrayListParser();
-NodeList nodes = element.getElementsByTagName(parser);
-for (int i = 0; i  nodes.getLength(); i++) {
-Element node = (Element) nodes.item(i);
-parsers.add(parserFromParserDomElement(node, mimeTypes, loader));
+
+// Should be only zero or one parsers tag
+NodeList nodes = element.getElementsByTagName(parsers);
+if (nodes.getLength()  1) {
+throw new TikaException(Properties may not contain multiple 
Parsers entries);
+}
+else if (nodes.getLength() == 1) {
+// Find only the direct child parser objects
+Node parsersE = nodes.item(0);
+nodes = parsersE.getChildNodes();
+for (int i = 0; i  nodes.getLength(); i++) {
+Node node = nodes.item(i);
+if (node instanceof Element) {
+Element nodeE = (Element)node;
+if (parser.equals(nodeE.getTagName())) {
+parsers.add(parserFromParserDomElement(nodeE, 
mimeTypes, loader));
+}
+}
+}
 }
 
 if (parsers.isEmpty()) {
@@ -444,21 +459,26 @@ public class TikaConfig {
 private static SetMediaType mediaTypesListFromDomElement(
 Element node, String tag) 
 throws TikaException, IOException {
-NodeList mimes = node.getElementsByTagName(tag);
-if (mimes.getLength()  0) {
-SetMediaType types = new HashSetMediaType();
-for (int j = 0; j  mimes.getLength(); j++) {
-String mime = getText(mimes.item(j));
-MediaType type = MediaType.parse(mime);
-if (type != null) {
-types.add(type);
-} else {
-throw new TikaException(
-Invalid media type name:  + mime);
+SetMediaType types = null;
+NodeList children = node.getChildNodes();
+for (int i=0; ichildren.getLength(); i++) {
+Node cNode = children.item(i);
+if (cNode instanceof Element) {
+Element cElement = (Element)cNode;
+if (tag.equals(cElement.getTagName())) {
+String mime = getText(cElement);
+MediaType type = MediaType.parse(mime);
+if (type != null) {
+if (types == null) types = new HashSetMediaType();
+types.add(type);
+} else {
+throw new TikaException(
+Invalid media type name:  + mime);
+}
 }
 }
-return types;
 }
+if (types != null) return types;
 return Collections.emptySet();
 }
 

Modified: 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java?rev=1684199&r1=1684198&r2=1684199&view=diff
==
--- 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java 
(original)
+++ 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java 
Mon Jun  8 14:41:48 2015
@@ -177,4 +177,39 @@ public class TikaConfigTest {
 System.clearProperty(tika.config);
 }
 }
+
+/**
+ * TIKA-1653 If one parser has child parsers, those child parsers shouldn't
+ *  show up at the top level as well

svn commit: r1683076 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

2015-06-02 Thread nick
Author: nick
Date: Tue Jun  2 11:00:22 2015
New Revision: 1683076

URL: http://svn.apache.org/r1683076
Log:
Try to make the low-priority padded PDF magic match more specific, as it looks 
to have incorrectly triggered on a few of the govdocs text files

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1683076&r1=1683075&r2=1683076&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Tue Jun  2 11:00:22 2015
@@ -489,7 +489,8 @@
 magic priority=20
   !-- Low priority match for %PDF near the start of the file --
   !-- Can trigger false positives, so set the priority rather low here --
-  match value=%PDF- type=string offset=1:512/
+  match value=%PDF-1. type=string offset=1:512/
+  match value=%PDF-2. type=string offset=1:512/
 /magic
 glob pattern=*.pdf/
   /mime-type




svn commit: r1683101 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java tika-parsers/src/test/res

2015-06-02 Thread nick
Author: nick
Date: Tue Jun  2 13:15:21 2015
New Revision: 1683101

URL: http://svn.apache.org/r1683101
Log:
Bibtex entries are case insensitive, and might start with a comment, so tweak 
magic and add a test file. (Spotted in govdocs1)

Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testBIBTEX.bib
Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1683101&r1=1683100&r2=1683101&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Tue Jun  2 13:15:21 2015
@@ -487,7 +487,7 @@
   match value=\xef\xbb\xbf%PDF- type=string offset=0/
 /magic
 magic priority=20
-  !-- Low priority match for %PDF near the start of the file --
+  !-- Low priority match for %PDF-#.# near the start of the file --
   !-- Can trigger false positives, so set the priority rather low here --
   match value=%PDF-1. type=string offset=1:512/
   match value=%PDF-2. type=string offset=1:512/
@@ -2793,17 +2793,34 @@
   match value=%%%\ \  type=string offset=73/
   match value=%\ BibTeX\ standard\ bibliography\  type=string 
offset=0/
   match value=%%%\ \ @BibTeX-style-file{ type=string offset=73/
-  match value=@article{ type=string offset=0/
-  match value=@book{ type=string offset=0/
-  match value=@inbook{ type=string offset=0/
-  match value=@incollection{ type=string offset=0/
-  match value=@inproceedings{ type=string offset=0/
-  match value=@manual{ type=string offset=0/
-  match value=@misc{ type=string offset=0/
-  match value=@preamble{ type=string offset=0/
-  match value=@phdthesis{ type=string offset=0/
-  match value=@techreport{ type=string offset=0/
-  match value=@unpublished{ type=string offset=0/
+  match value=@article{ type=stringignorecase offset=0/
+  match value=@book{ type=stringignorecase offset=0/
+  match value=@inbook{ type=stringignorecase offset=0/
+  match value=@incollection{ type=stringignorecase offset=0/
+  match value=@inproceedings{ type=stringignorecase offset=0/
+  match value=@manual{ type=stringignorecase offset=0/
+  match value=@misc{ type=stringignorecase offset=0/
+  match value=@preamble{ type=stringignorecase offset=0/
+  match value=@phdthesis{ type=stringignorecase offset=0/
+  match value=@string{ type=stringignorecase offset=0/
+  match value=@techreport{ type=stringignorecase offset=0/
+  match value=@unpublished{ type=stringignorecase offset=0/
+/magic
+magic priority=30
+  match value=% type=string offset=0
+ match value=\n@article{ type=stringignorecase offset=2:128/
+ match value=\n@book{ type=stringignorecase offset=2:128/
+ match value=\n@inbook{ type=stringignorecase offset=2:128/
+ match value=\n@incollection{ type=stringignorecase 
offset=2:128/
+ match value=\n@inproceedings{ type=stringignorecase 
offset=2:128/
+ match value=\n@manual{ type=stringignorecase offset=2:128/
+ match value=\n@misc{ type=stringignorecase offset=2:128/
+ match value=\n@preamble{ type=stringignorecase offset=2:128/
+ match value=\n@phdthesis{ type=stringignorecase offset=2:128/
+ match value=\n@string{ type=stringignorecase offset=2:128/
+ match value=\n@techreport{ type=stringignorecase offset=2:128/
+ match value=\n@unpublished{ type=stringignorecase 
offset=2:128/
+  /match
 /magic
 glob pattern=*.bib/
 glob pattern=*.bibtex/

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1683101&r1=1683100&r2=1683101&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
(original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
Tue Jun  2 13:15:21 2015
@@ -932,6 +932,12 @@ public class TestMimeTypes {
 }
 
 @Test
+public void testTextFormats() throws Exception {
+assertType(application/x-bibtex-text-file, testBIBTEX.bib);
+assertTypeByData(application/x-bibtex-text-file, testBIBTEX.bib);
+}
+
+@Test
 public void testCodeFormats() throws Exception {
 assertType(text/x-csrc, testC.c);
 

Added: tika/trunk/tika-parsers/src/test

svn commit: r1683107 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java tika-parsers/src/test/res

2015-06-02 Thread nick
Author: nick
Date: Tue Jun  2 13:33:55 2015
New Revision: 1683107

URL: http://svn.apache.org/r1683107
Log:
TIKA-1634 Few more matlab and other code related tests

Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testH.h
Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1683107&r1=1683106&r2=1683107&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Tue Jun  2 13:33:55 2015
@@ -5580,6 +5580,9 @@
   mime-type type=text/x-chdr
 _commentC source code header/_comment
 glob pattern=*.h/
+magic priority=30
+  match value=#ifndef  type=string offset=0/
+/magic
 sub-class-of type=text/plain/
   /mime-type
 
@@ -5598,6 +5601,9 @@
   mime-type type=text/x-csrc
 alias type=text/x-c/
 _commentC source code/_comment
+magic priority=30
+  match value=#include  type=string offset=0/
+/magic
 glob pattern=*.c/
 sub-class-of type=text/plain/
   /mime-type

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1683107&r1=1683106&r2=1683107&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
(original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
Tue Jun  2 13:33:55 2015
@@ -940,10 +940,16 @@ public class TestMimeTypes {
 @Test
 public void testCodeFormats() throws Exception {
 assertType(text/x-csrc, testC.c);
+assertType(text/x-chdr, testH.h);
+assertTypeByData(text/x-csrc, testC.c);
+assertTypeByData(text/x-chdr, testH.h);
 
 assertType(text/x-matlab, testMATLAB.m);
 assertType(text/x-matlab, testMATLAB_wtsgaus.m);
 assertType(text/x-matlab, testMATLAB_barcast.m);
+assertTypeByData(text/x-matlab, testMATLAB.m);
+assertTypeByData(text/x-matlab, testMATLAB_wtsgaus.m);
+assertTypeByData(text/x-matlab, testMATLAB_barcast.m);
 }
 
 private void assertText(byte[] prefix) throws IOException {

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testH.h
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testH.h?rev=1683107&view=auto
==
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testH.h (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testH.h Tue Jun  
2 13:33:55 2015
@@ -0,0 +1,5 @@
+#ifndef TIKA_HELLO_WORLD
+#define TIKA_HELLO_WORLD
+
+#define HELLO world
+#endif




svn commit: r1681337 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pkg/CompressorParser.java test/java/org/apache/tika/parser/pkg/ZlibParserTest.java

2015-05-23 Thread nick
Author: nick
Date: Sat May 23 12:21:05 2015
New Revision: 1681337

URL: http://svn.apache.org/r1681337
Log:
TIKA-1635 Disabled zlib parser support, not yet enabled pending a fix for a 
commons compress bug

Added:

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
Modified:

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java?rev=1681337&r1=1681336&r2=1681337&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
 Sat May 23 12:21:05 2015
@@ -27,6 +27,7 @@ import org.apache.commons.compress.compr
 import org.apache.commons.compress.compressors.CompressorInputStream;
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import 
org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import 
org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
 import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
 import org.apache.commons.compress.compressors.gzip.GzipUtils;
 import 
org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream;
@@ -57,6 +58,8 @@ public class CompressorParser extends Ab
 private static final MediaType GZIP_ALT = MediaType.application(x-gzip);
 private static final MediaType XZ = MediaType.application(x-xz);
 private static final MediaType PACK = 
MediaType.application(application/x-java-pack200);
+// TODO Not yet supported by CompressorStreamFactory, see COMPRESS-316
+private static final MediaType ZLIB = MediaType.application(zlib);
 
 private static final SetMediaType SUPPORTED_TYPES =
 MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, XZ, PACK);
@@ -73,6 +76,8 @@ public class CompressorParser extends Ab
 return GZIP;
 } else if (stream instanceof XZCompressorInputStream) {
 return XZ;
+} else if (stream instanceof DeflateCompressorInputStream) {
+return ZLIB;
 } else if (stream instanceof Pack200CompressorInputStream) {
 return PACK;
 } else {
@@ -133,6 +138,8 @@ public class CompressorParser extends Ab
 name = name.substring(0, name.length() - 4);
 } else if (name.endsWith(.xz)) {
 name = name.substring(0, name.length() - 3);
+} else if (name.endsWith(.zlib)) {
+name = name.substring(0, name.length() - 5);
 } else if (name.endsWith(.pack)) {
 name = name.substring(0, name.length() - 5);
 } else if (name.length()  0) {

Added: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java?rev=1681337&view=auto
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
 (added)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
 Sat May 23 12:21:05 2015
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the License); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing zlib compressed
+ * 
+ * Note - currently disabled, pending a fix for COMPRESS-316
+ */
+public class ZlibParserTest extends AbstractPkgTest {
+@Test

svn commit: r1681351 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

2015-05-23 Thread nick
Author: nick
Date: Sat May 23 14:05:20 2015
New Revision: 1681351

URL: http://svn.apache.org/r1681351
Log:
TIKA-1634 Two more kinds of matlab magic, and tests

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1681351&r1=1681350&r2=1681351&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Sat May 23 14:05:20 2015
@@ -5752,9 +5752,23 @@
 
   mime-type type=text/x-matlab
 _commentMatlab source code/_comment
+!-- Multiple-output function definition --
 magic priority=50
   match value=function [ type=string offset=0/
 /magic
+!-- Single-output or no output function definition --
+magic priority=40
+  match value=function [a-zA-Z][A-Za-z0-9_]{0,5} type=regex 
offset=0/
+/magic
+!-- Two matlab-style comments fairly early in the file --
+magic priority=25
+  match value=% type=string offset=0
+ match value=\n% type=string offset=2:120/
+  /match
+  match value=% type=string offset=0
+ match value=\r% type=string offset=2:120/
+  /match
+/magic
 !-- glob pattern=*.m/ - conflicts with text/x-objcsrc --
 sub-class-of type=text/plain/
   /mime-type

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1681351&r1=1681350&r2=1681351&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
(original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
Sat May 23 14:05:20 2015
@@ -930,6 +930,15 @@ public class TestMimeTypes {
 assertTypeByData(application/zlib, testTXT.zlib5);
 assertTypeByData(application/zlib, testTXT.zlib9);
 }
+
+@Test
+public void testCodeFormats() throws Exception {
+assertType(text/x-csrc, testC.c);
+
+assertType(text/x-matlab, testMATLAB.m);
+assertType(text/x-matlab, testMATLAB_wtsgaus.m);
+assertType(text/x-matlab, testMATLAB_barcast.m);
+}
 
 private void assertText(byte[] prefix) throws IOException {
 assertMagic(text/plain, prefix);




svn commit: r1681348 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

2015-05-23 Thread nick
Author: nick
Date: Sat May 23 13:43:27 2015
New Revision: 1681348

URL: http://svn.apache.org/r1681348
Log:
Add an alternate zlib mimetype found in some places

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1681348&r1=1681347&r2=1681348&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Sat May 23 13:43:27 2015
@@ -4006,6 +4006,7 @@
   /mime-type
 
   mime-type type=application/zlib
+alias type=application/x-deflate/
 _commentZLIB Compressed Data Format/_comment
 tika:linkhttp://tools.ietf.org/html/rfc1950/tika:link
 magic priority=45




svn commit: r1681349 - in /tika/trunk/tika-parsers/src/test/resources/test-documents: testMATLAB.m testMATLAB_barcast.m testMATLAB_wtsgaus.m

2015-05-23 Thread nick
Author: nick
Date: Sat May 23 13:51:55 2015
New Revision: 1681349

URL: http://svn.apache.org/r1681349
Log:
TIKA-1634 Add some sample matlab files

Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB.m

tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_barcast.m

tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_wtsgaus.m

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB.m
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB.m?rev=1681349&view=auto
==
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB.m 
(added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB.m Sat 
May 23 13:51:55 2015
@@ -0,0 +1,4 @@
+function helloworld
+fprintf('Hello, World!\n')
+disp('Hello, World!');
+end

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_barcast.m
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_barcast.m?rev=1681349&view=auto
==
--- 
tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_barcast.m 
(added)
+++ 
tika/trunk/tika-parsers/src/test/resources/test-documents/testMATLAB_barcast.m 
Sat May 23 13:51:55 2015
@@ -0,0 +1,383 @@
+%% CONTROL CODE FOR FULLY BAYESIAN SPATIO-TEMPORAL TEMPERATURE RECONSTRUCTION
+%EVERYTHING IS MODULAR TO ALLOW FOR EASY DEBUGGING AND ADAPTATION
+% _vNewModel_Oct08: change the formalism to reflect new model (Beta_1 now
+% normal). Allows for multiple proxies
+clear all; close all;
+%SET MATLAB'S CURRENT DIRECTORY TO HERE. 
+% set the priors and the initial values for the MCMC sampler
+Prior_pars_vNewModel
+Initial_par_vals_vNewModel
+%% Set the seed of the random number generators
+randn('state', sum((1000+600)*clock))
+rand('state', sum((1000+800)*clock))
+
+%% load the data
+cd TestData
+load BARCAST_INPUT_vNewMeth1
+%break it apart
+Locs=BARCAST_INPUT.Master_Locs;
+N_Locs=length(Locs(:,1)); %Number of locations:
+timeline=[BARCAST_INPUT.Data_timeline(1)-1, BARCAST_INPUT.Data_timeline];
+N_Times=length(timeline)-1; %Number of DATA times
+loc_areas=BARCAST_INPUT.Areas;
+Inds_GridLocs_Central=BARCAST_INPUT.Inds_Central;
+
+%get the number of proxy types:
+N_PT=length(fieldnames(BARCAST_INPUT))-5;
+
+%stack the three data matrices, one on top of the other
+%the first N_Locs ROWS are the Inst, the next N_Locs ROWS the first proxy
+%type, the next the third. . . . .. Each column a year. The first
+%corresponds to the SECOND entry in timeline. 
+Data_ALL=BARCAST_INPUT.Inst_Data;
+for kk=1:1:N_PT
+tp=eval(['BARCAST_INPUT.Prox_Data', num2str(kk)]);
+Data_ALL=[Data_ALL; tp];
+end
+
+% % % % All_locs_wInd=BARCAST_INPUT.All_locs_wInd;
+% % % % lon_lat_area=BARCAST_INPUT.lon_lat_area;
+% % % % DATA_Mat=BARCAST_INPUT.DATA_Mat;
+% % % % DATA_Mat_locs=BARCAST_INPUT.DATA_Mat_locs;
+% % % % Inds_GridLocs_Central=BARCAST_INPUT.Inds_GridLocs_Central;
+% % % % timeline=BARCAST_INPUT.timeline;
+% % % % clear BARCAST_INPUT
+
+%Priors and MH jumping parameters, from Prior_pars_vNewModel
+load PRIORS_vNewMeth1
+load MHpars_vNewMeth1
+%Initial values from Initial_par_vals_vNewModel
+load INITIAL_VALS_vNewMeth1
+
+%The Order of THE SCALAR parameters WILL ALWAYS thus:
+%1 = alpha, the AR(1) coefficient
+%2 = mu, the constant par in the linear mean of the AR(1) process
+%3 = sigma2, the partial sill in the spatial covariance matrix
+%4 = phi, the range parameter in the spatial covariance matrix
+%5 = tau2_I, the Inst measurement error
+%6 = tau2_P, the measurement error, first PROX type
+%7 = Beta_1, the scaling par in the  first P observation equation
+%8 = Beta_0, the additive par in the first P observation equation
+%and, if there is second proxy type
+%9  = tau2_P_2, the measurement error, second PROX type
+%10 = Beta_1, the scaling par in the  second P observation equation
+%11 = Beta_0, the additive par in the second P observation equation
+%and, if there is third proxy type . . . . 
+
+%A NOTE ON GAMMA NOTATION. WE USE THE NOTATION OF Gelman et al, Bayesian
+%Data Analysis, WHERE GAMMA PARAMETERS (ALPHA, BETA)==(SHAPE, INVERSE SCALE). 
+%THE RANDRAW.M CODE USES (A,B)==(SHAPE, SCALE), AND THE CALL IS 
RANDRAW('GAMMA', [M,B,A], SAMPLESIZE), 
+%WHERE M IS THE LOCATION (NOT NEEDED). SO IN THE NOTATION OF GELMAN ET AL., THE 
CALL IS
+%RANDRAW('GAMMA', [0,1/BETA,ALPHA], SAMPLESIZE). 
+%For example,
+%RANDRAW('GAMMA', [0,1/PRIORS.sigma2(2),PRIORS.sigma2(1)], 1), AND ETC. 
+
+%switch back to the main directory
+cd ..
+%% SET a few parameters
+%Number of iterations of the complete sampler
+Sampler_Its=2000;
+
+%Number of times to update only the temperature array before beginning to
+%update the other parameters
+pre_Sampler_Its=500; 
+
+
+%% Areal weights vector

svn commit: r1680957 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/test/java/org/apache/tika/mime/ tika-parsers/src/test/java/org/apache/tika/parser/pdf/ tika-p

2015-05-21 Thread nick
Author: nick
Date: Thu May 21 21:49:11 2015
New Revision: 1680957

URL: http://svn.apache.org/r1680957
Log:
TIKA-1085 Treat a PDF with a leading Byte Order Mark the same for detection, 
and add low-priority matches for the PDF magic coming in 1-1024 bytes of the 
start (may give false positives if too high), plus tests

Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf   
(with props)
Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1680957&r1=1680956&r2=1680957&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Thu May 21 21:49:11 2015
@@ -481,7 +481,15 @@
 
tika:linkhttp://www.adobe.com/devnet/pdf/pdf_reference_archive.html/tika:link
 tika:uticom.adobe.pdf/tika:uti
 magic priority=50
+  !-- Normally just %PDF- --
   match value=%PDF- type=string offset=0/
+  !-- Sometimes has a UTF-8 Byte Order Mark first --
+  match value=\xef\xbb\xbf%PDF- type=string offset=0/
+/magic
+magic priority=20
+  !-- Low priority match for %PDF near the start of the file --
+  !-- Can trigger false positives, so set the priority rather low here --
+  match value=%PDF- type=string offset=1:512/
 /magic
 glob pattern=*.pdf/
   /mime-type

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1680957&r1=1680956&r2=1680957&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
(original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
Thu May 21 21:49:11 2015
@@ -501,10 +501,17 @@ public class TestMimeTypes {
 
 @Test
 public void testPdfDetection() throws Exception {
-assertType(application/pdf, testPDF.pdf);
-assertTypeByData(application/pdf, testPDF.pdf);
+// PDF extension by name is enough
 assertTypeByName(application/pdf, x.pdf);
 assertTypeByName(application/pdf, x.PDF);
+
+// For normal PDFs, can get by name or data or both
+assertType(application/pdf, testPDF.pdf);
+assertTypeByData(application/pdf, testPDF.pdf);
+
+// PDF with a BoM works both ways too
+assertType(application/pdf, testPDF_bom.pdf);
+assertTypeByData(application/pdf, testPDF_bom.pdf);
 }
 
 @Test

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1680957&r1=1680956&r2=1680957&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 Thu May 21 21:49:11 2015
@@ -652,6 +652,8 @@ public class PDFParserTest extends TikaT
 knownMetadataDiffs.add(testAnnotations.pdf);
 // Added for TIKA-93.
 knownMetadataDiffs.add(testOCR.pdf);
+// Added for TIKA-1085
+knownMetadataDiffs.add(testPDF_bom.pdf);
 
 //empty for now
 SetString knownContentDiffs = new HashSetString();

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf?rev=1680957&view=auto
==
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bom.pdf
--
svn:mime-type = application/octet-stream




svn commit: r1680959 - /tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

2015-05-21 Thread nick
Author: nick
Date: Thu May 21 22:13:08 2015
New Revision: 1680959

URL: http://svn.apache.org/r1680959
Log:
TIKA-1632 zlib mime magic from Pavel Micka

Modified:

tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1680959&r1=1680958&r2=1680959&view=diff
==
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Thu May 21 22:13:08 2015
@@ -4005,6 +4005,19 @@
 glob pattern=*.zip/
   /mime-type
 
+  mime-type type=application/zlib
+_commentZLIB Compressed Data Format/_comment
+tika:linkhttp://tools.ietf.org/html/rfc1950/tika:link
+magic priority=45
+  !-- Low compression --
+  match value=\x78\x01 type=string offset=0 /
+  !-- Default compression --
+  match value=\x78\x9c type=string offset=0 /
+  !-- Best compression --
+  match value=\x78\xda type=string offset=0 /
+/magic
+  /mime-type
+
   mime-type type=application/x-7z-compressed
 acronym7zip/acronym
 _comment7-zip archive/_comment




svn commit: r1680358 - in /tika/site: publish/1.7/formats.html publish/1.8/formats.html publish/1.9/formats.html src/site/apt/1.7/formats.apt src/site/apt/1.8/formats.apt src/site/apt/1.9/formats.apt

2015-05-19 Thread nick
Author: nick
Date: Tue May 19 17:57:53 2015
New Revision: 1680358

URL: http://svn.apache.org/r1680358
Log:
Update the formats to make it clearer that these are the parser-supported 
formats, and more get detection-only

Modified:
tika/site/publish/1.7/formats.html
tika/site/publish/1.8/formats.html
tika/site/publish/1.9/formats.html
tika/site/src/site/apt/1.7/formats.apt
tika/site/src/site/apt/1.8/formats.apt
tika/site/src/site/apt/1.9/formats.apt

Modified: tika/site/publish/1.7/formats.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.7/formats.html?rev=1680358&r1=1680357&r2=1680358&view=diff
==
--- tika/site/publish/1.7/formats.html (original)
+++ tika/site/publish/1.7/formats.html Tue May 19 17:57:53 2015
@@ -86,7 +86,8 @@
   div id=content
 !-- Licensed to the Apache Software Foundation (ASF) under one or 
more --!-- contributor license agreements.  See the NOTICE file distributed 
with --!-- this work for additional information regarding copyright 
ownership. --!-- The ASF licenses this file to You under the Apache License, 
Version 2.0 --!-- (the License); you may not use this file except in 
compliance with --!-- the License.  You may obtain a copy of the License at 
--!--  --!-- http://www.apache.org/licenses/LICENSE-2.0 --!--  --!-- 
Unless required by applicable law or agreed to in writing, software --!-- 
distributed under the License is distributed on an AS IS BASIS, --!-- 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
--!-- See the License for the specific language governing permissions and 
--!-- limitations under the License. --div class=section
 h2Supported Document Formatsa name=Supported_Document_Formats/a/h2
-pThis page lists all the document formats supported by Apache Tika 1.7. 
Follow the links to the various parser class javadocs for more detailed 
information about each document format and how it is parsed by Tika./p
+pThis page lists all the document formats supported by the parsers in Apache 
Tika 1.7. Follow the links to the various parser class javadocs for more 
detailed information about each document format and how it is parsed by 
Tika./p
+p(Please note that Apache Tika is able to detect a much wider range of 
formats than those listed below, this page only documents those formats from 
which Tika is able to extract metadata and/or textual content)/p
 ul
 lia href=#Supported_Document_FormatsSupported Document Formats/a
 ul

Modified: tika/site/publish/1.8/formats.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.8/formats.html?rev=1680358&r1=1680357&r2=1680358&view=diff
==
--- tika/site/publish/1.8/formats.html (original)
+++ tika/site/publish/1.8/formats.html Tue May 19 17:57:53 2015
@@ -86,7 +86,8 @@
   div id=content
 !-- Licensed to the Apache Software Foundation (ASF) under one or 
more --!-- contributor license agreements.  See the NOTICE file distributed 
with --!-- this work for additional information regarding copyright 
ownership. --!-- The ASF licenses this file to You under the Apache License, 
Version 2.0 --!-- (the License); you may not use this file except in 
compliance with --!-- the License.  You may obtain a copy of the License at 
--!--  --!-- http://www.apache.org/licenses/LICENSE-2.0 --!--  --!-- 
Unless required by applicable law or agreed to in writing, software --!-- 
distributed under the License is distributed on an AS IS BASIS, --!-- 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
--!-- See the License for the specific language governing permissions and 
--!-- limitations under the License. --div class=section
 h2Supported Document Formatsa name=Supported_Document_Formats/a/h2
-pThis page lists all the document formats supported by Apache Tika 1.8. 
Follow the links to the various parser class javadocs for more detailed 
information about each document format and how it is parsed by Tika./p
+pThis page lists all the document formats supported by the parsers in Apache 
Tika 1.8. Follow the links to the various parser class javadocs for more 
detailed information about each document format and how it is parsed by 
Tika./p
+p(Please note that Apache Tika is able to detect a much wider range of 
formats than those listed below, this page only documents those formats from 
which Tika is able to extract metadata and/or textual content)/p
 ul
 lia href=#Supported_Document_FormatsSupported Document Formats/a
 ul

Modified: tika/site/publish/1.9/formats.html
URL: 
http://svn.apache.org/viewvc/tika/site/publish/1.9/formats.html?rev=1680358&r1=1680357&r2=1680358&view=diff
==
--- tika/site/publish/1.9/formats.html (original)
+++ tika/site/publish/1.9/formats.html Tue May 19 17:57:53 2015
@@ -86,7 +86,8 @@
   div id

<    1   2   3   4   5   6   7   8   >