(tika) branch branch_2x updated: TIKA-4244 -- improve ics detection (#1731)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/branch_2x by this push: new b419cf5eb TIKA-4244 -- improve ics detection (#1731) b419cf5eb is described below commit b419cf5eb7c87bec530718caef7313f4270abdc3 Author: Tim Allison AuthorDate: Thu Apr 25 12:01:45 2024 -0400 TIKA-4244 -- improve ics detection (#1731) (cherry picked from commit f78dc999be9c0d87a83b54aa6af74fbcf996f22e) --- .../main/resources/org/apache/tika/mime/tika-mimetypes.xml | 2 +- .../src/test/java/org/apache/tika/mime/TestMimeTypes.java | 4 .../resources/test-documents/testICalendar_w_prodId.ics | 13 + 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 54f4b2051..c00b7f3d4 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -7213,7 +7213,7 @@ - + diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 886fe4ad6..bc81c7094 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -1235,6 +1235,10 @@ public class TestMimeTypes { assertType("text/x-vcalendar", "testVCalendar.vcs"); assertTypeByData("text/calendar", "testICalendar.ics"); assertTypeByData("text/x-vcalendar", "testVCalendar.vcs"); +//TIKA-4244 +//this tests detection with content intervening between the 
BEGIN:VCALENDAR and the VERSION:2.0 entry +assertType("text/calendar", "testICalendar_w_prodId.ics"); +assertTypeByData("text/calendar", "testICalendar_w_prodId.ics"); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics new file mode 100644 index 0..0af25fc46 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics @@ -0,0 +1,13 @@ +BEGIN:VCALENDAR +PRODID:-//Example Corp//iCalendar Export//EN +VERSION:2.0 +BEGIN:VEVENT +UID:1234567...@example.com +DTSTAMP:20240101T08Z +DTSTART:20240101T10Z +DTEND:20240101T12Z +SUMMARY:Sample HTML Event +DESCRIPTION:This is a sample event with an HTML description. +X-ALT-DESC;FMTTYPE=text/html:Sample HTML EventThis is a sample event with an HTML description. +END:VEVENT +END:VCALENDAR
(tika) branch TIKA-4244 deleted (was 6772787f9)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4244 in repository https://gitbox.apache.org/repos/asf/tika.git was 6772787f9 TIKA-4244 -- improve ics detection The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch main updated: TIKA-4244 -- improve ics detection (#1731)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new f78dc999b TIKA-4244 -- improve ics detection (#1731) f78dc999b is described below commit f78dc999be9c0d87a83b54aa6af74fbcf996f22e Author: Tim Allison AuthorDate: Thu Apr 25 12:01:45 2024 -0400 TIKA-4244 -- improve ics detection (#1731) --- .../main/resources/org/apache/tika/mime/tika-mimetypes.xml | 2 +- .../src/test/java/org/apache/tika/mime/TestMimeTypes.java | 4 .../resources/test-documents/testICalendar_w_prodId.ics | 13 + 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index a1e9de0fd..09bbd963c 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -7319,7 +7319,7 @@ - + diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java index cd6705b69..a988c440e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -1235,6 +1235,10 @@ public class TestMimeTypes { assertType("text/x-vcalendar", "testVCalendar.vcs"); assertTypeByData("text/calendar", "testICalendar.ics"); assertTypeByData("text/x-vcalendar", "testVCalendar.vcs"); +//TIKA-4244 +//this tests detection with content intervening between the BEGIN:VCALENDAR and the VERSION:2.0 entry +assertType("text/calendar", 
"testICalendar_w_prodId.ics"); +assertTypeByData("text/calendar", "testICalendar_w_prodId.ics"); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics new file mode 100644 index 0..0af25fc46 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics @@ -0,0 +1,13 @@ +BEGIN:VCALENDAR +PRODID:-//Example Corp//iCalendar Export//EN +VERSION:2.0 +BEGIN:VEVENT +UID:1234567...@example.com +DTSTAMP:20240101T08Z +DTSTART:20240101T10Z +DTEND:20240101T12Z +SUMMARY:Sample HTML Event +DESCRIPTION:This is a sample event with an HTML description. +X-ALT-DESC;FMTTYPE=text/html:Sample HTML EventThis is a sample event with an HTML description. +END:VEVENT +END:VCALENDAR
(tika) 01/01: TIKA-4244 -- improve ics detection
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4244 in repository https://gitbox.apache.org/repos/asf/tika.git commit 6772787f97471181158e90319ee7d2a682fd6365 Author: tallison AuthorDate: Thu Apr 25 11:38:34 2024 -0400 TIKA-4244 -- improve ics detection --- .../main/resources/org/apache/tika/mime/tika-mimetypes.xml | 2 +- .../src/test/java/org/apache/tika/mime/TestMimeTypes.java | 4 .../resources/test-documents/testICalendar_w_prodId.ics | 13 + 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index a1e9de0fd..09bbd963c 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -7319,7 +7319,7 @@ - + diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java index cd6705b69..a988c440e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -1235,6 +1235,10 @@ public class TestMimeTypes { assertType("text/x-vcalendar", "testVCalendar.vcs"); assertTypeByData("text/calendar", "testICalendar.ics"); assertTypeByData("text/x-vcalendar", "testVCalendar.vcs"); +//TIKA-4244 +//this tests detection with content intervening between the BEGIN:VCALENDAR and the VERSION:2.0 entry +assertType("text/calendar", "testICalendar_w_prodId.ics"); +assertTypeByData("text/calendar", "testICalendar_w_prodId.ics"); } @Test diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics new file mode 100644 index 0..0af25fc46 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics @@ -0,0 +1,13 @@ +BEGIN:VCALENDAR +PRODID:-//Example Corp//iCalendar Export//EN +VERSION:2.0 +BEGIN:VEVENT +UID:1234567...@example.com +DTSTAMP:20240101T08Z +DTSTART:20240101T10Z +DTEND:20240101T12Z +SUMMARY:Sample HTML Event +DESCRIPTION:This is a sample event with an HTML description. +X-ALT-DESC;FMTTYPE=text/html:Sample HTML EventThis is a sample event with an HTML description. +END:VEVENT +END:VCALENDAR
(tika) branch TIKA-4244 created (now 6772787f9)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4244 in repository https://gitbox.apache.org/repos/asf/tika.git at 6772787f9 TIKA-4244 -- improve ics detection This branch includes the following new commits: new 6772787f9 TIKA-4244 -- improve ics detection The 1 revision listed above as "new" is entirely new to this repository and will be described in a separate email. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) branch main updated: TIKA-4242 -- get rid of non-existent plexus utils version and update junrar exclusion for modernity (#1727)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 74a080dff TIKA-4242 -- get rid of non-existent plexus utils version and update junrar exclusion for modernity (#1727) 74a080dff is described below commit 74a080dffcbfbc9039183cf0c0508baff707934b Author: Tim Allison AuthorDate: Wed Apr 17 12:26:38 2024 -0400 TIKA-4242 -- get rid of non-existent plexus utils version and update junrar exclusion for modernity (#1727) --- tika-parent/pom.xml| 6 -- .../tika-parsers-standard-modules/tika-parser-pkg-module/pom.xml | 7 --- 2 files changed, 13 deletions(-) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 6530831e7..8a96ef969 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -404,7 +404,6 @@ 2.0.14 4.13.5 3.0.2 -4.0.1 5.2.5 3.25.3 @@ -899,11 +898,6 @@ tagsoup ${tagsoup.version} - -org.codehaus.plexus -plexus-utils -${plexus.version} - org.freemarker freemarker diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/pom.xml index d3b0278a9..6844edb46 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/pom.xml @@ -56,13 +56,6 @@ com.github.junrar junrar ${junrar.version} - - - - org.apache.commons - commons-vfs2 - - commons-codec
(tika) branch TIKA-4242 deleted (was f9588722f)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4242 in repository https://gitbox.apache.org/repos/asf/tika.git was f9588722f Merge remote-tracking branch 'refs/remotes/origin/main' into TIKA-4242 The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch TIKA-4242 updated (05800b7c1 -> f9588722f)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4242 in repository https://gitbox.apache.org/repos/asf/tika.git from 05800b7c1 TIKA-4242 -- get rid of non-existent plexus utils version and update junrar exclusion for modernity add 01f87f981 TIKA-4242: Do not require a non-existent plexus version (#1726) add f9588722f Merge remote-tracking branch 'refs/remotes/origin/main' into TIKA-4242 No new revisions were added by this update. Summary of changes:
(tika) branch main updated: TIKA-4242: Do not require a non-existent plexus version (#1726)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 01f87f981 TIKA-4242: Do not require a non-existent plexus version (#1726) 01f87f981 is described below commit 01f87f9819b13fa8524ea4c9c5f64b9541f9c23b Author: Björn Kautler AuthorDate: Wed Apr 17 18:02:47 2024 +0200 TIKA-4242: Do not require a non-existent plexus version (#1726) --- tika-parent/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 1c7555339..6530831e7 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -404,7 +404,7 @@ 2.0.14 4.13.5 3.0.2 -5.0.0 +4.0.1 5.2.5 3.25.3
(tika) 01/01: TIKA-4242 -- get rid of non-existent plexus utils version and update junrar exclusion for modernity
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4242 in repository https://gitbox.apache.org/repos/asf/tika.git commit 05800b7c1519f57e17be4d93fe0b675f243b4216 Author: tallison AuthorDate: Wed Apr 17 12:01:58 2024 -0400 TIKA-4242 -- get rid of non-existent plexus utils version and update junrar exclusion for modernity --- tika-parent/pom.xml| 6 -- .../tika-parsers-standard-modules/tika-parser-pkg-module/pom.xml | 7 --- 2 files changed, 13 deletions(-) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 1c7555339..8a96ef969 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -404,7 +404,6 @@ 2.0.14 4.13.5 3.0.2 -5.0.0 5.2.5 3.25.3 @@ -899,11 +898,6 @@ tagsoup ${tagsoup.version} - -org.codehaus.plexus -plexus-utils -${plexus.version} - org.freemarker freemarker diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/pom.xml index d3b0278a9..6844edb46 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/pom.xml @@ -56,13 +56,6 @@ com.github.junrar junrar ${junrar.version} - - - - org.apache.commons - commons-vfs2 - - commons-codec
(tika) branch TIKA-4242 created (now 05800b7c1)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4242 in repository https://gitbox.apache.org/repos/asf/tika.git at 05800b7c1 TIKA-4242 -- get rid of non-existent plexus utils version and update junrar exclusion for modernity This branch includes the following new commits: new 05800b7c1 TIKA-4242 -- get rid of non-existent plexus utils version and update junrar exclusion for modernity The 1 revision listed above as "new" is entirely new to this repository and will be described in a separate email. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) branch main updated: add joda-time to management dependencies and exclude 2.12.7 (latest available) from ossindex
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 42c208927 add joda-time to management dependencies and exclude 2.12.7 (latest available) from ossindex 42c208927 is described below commit 42c2089270b9641bfc0252bbba4e48a0c419d287 Author: tallison AuthorDate: Thu Apr 11 12:48:15 2024 -0400 add joda-time to management dependencies and exclude 2.12.7 (latest available) from ossindex --- tika-parent/pom.xml | 10 ++ 1 file changed, 10 insertions(+) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index d639407a8..0cf363cab 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -751,6 +751,11 @@ jakarta.xml.soap-api ${jakarta.xml.soap.version} + +joda-time +joda-time +2.12.7 + junit junit @@ -1121,6 +1126,11 @@ threetenbp 1.6.8 + + joda-time + joda-time + 2.12.7 + true
(tika) branch main updated: improve logging in AsyncResource
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new b7a662925 improve logging in AsyncResource new 87e84f74b Merge remote-tracking branch 'origin/main' b7a662925 is described below commit b7a66292536c853389a8a002fa48cd85b1542400 Author: tallison AuthorDate: Thu Apr 11 12:30:01 2024 -0400 improve logging in AsyncResource --- .../main/java/org/apache/tika/server/core/resource/AsyncResource.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java index a4d4ed489..e5c00eb41 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java @@ -121,11 +121,14 @@ public class AsyncResource { try { boolean offered = asyncProcessor.offer(request.getTuples(), maxQueuePauseMs); if (offered) { +LOG.info("accepted {} tuples, capacity={}", request.getTuples().size(), asyncProcessor.getCapacity()); return ok(request.getTuples().size()); } else { +LOG.info("throttling {} tuples, capacity={}", request.getTuples().size(), asyncProcessor.getCapacity()); return throttle(request.getTuples().size()); } } catch (OfferLargerThanQueueSize e) { +LOG.info("throttling {} tuples, capacity={}", request.getTuples().size(), asyncProcessor.getCapacity()); return throttle(request.getTuples().size()); } }
(tika) branch main updated: TIKA-4240 -- backoff dependabot updates to weekly
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 73bc66d32 TIKA-4240 -- backoff dependabot updates to weekly 73bc66d32 is described below commit 73bc66d320c9117c21b0d02b650431a04aefbf85 Author: tallison AuthorDate: Thu Apr 11 10:40:25 2024 -0400 TIKA-4240 -- backoff dependabot updates to weekly --- .github/dependabot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 048beaf5b..d8bd0d0ea 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -21,5 +21,5 @@ updates: directory: "/" # Check for updates once daily schedule: - interval: "daily" + interval: "weekly"
(tika) branch main updated: add threeten to ossindex ignore correctly for at least two versions :/
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 94c78cb7b add threeten to ossindex ignore correctly for at least two versions :/ 94c78cb7b is described below commit 94c78cb7b70b462aec26b8b813d5f1cf103d3423 Author: tallison AuthorDate: Wed Apr 10 12:03:43 2024 -0400 add threeten to ossindex ignore correctly for at least two versions :/ --- tika-parent/pom.xml | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 4b1e0630d..691935755 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -,10 +,16 @@ - threetenbp - org.threeten + org.threeten + threetenbp 1.3.3 + + + org.threeten + threetenbp + 1.6.8 + true
(tika) branch main updated: add threeten to ossindex ignore
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 614943a15 add threeten to ossindex ignore 614943a15 is described below commit 614943a154793e61ce83c01ec3b7e7e418cf4c44 Author: tallison AuthorDate: Wed Apr 10 11:11:34 2024 -0400 add threeten to ossindex ignore --- tika-parent/pom.xml | 6 ++ 1 file changed, 6 insertions(+) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 1fdcad74b..4b1e0630d 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -1109,6 +1109,12 @@ uimaj-core 3.4.1 + + + threetenbp + org.threeten + 1.3.3 + true
(tika) branch main updated: update intellij-code-style.xml
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 8076b2df7 update intellij-code-style.xml 8076b2df7 is described below commit 8076b2df7445f9ea3c81c9ad0987a6cfcce2f210 Author: tallison AuthorDate: Wed Apr 10 10:58:43 2024 -0400 update intellij-code-style.xml --- tika-parent/intellij-code-style.xml | 26 +- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/tika-parent/intellij-code-style.xml b/tika-parent/intellij-code-style.xml index 3e6249b49..9768e08ba 100644 --- a/tika-parent/intellij-code-style.xml +++ b/tika-parent/intellij-code-style.xml @@ -1,22 +1,4 @@ - - + @@ -41,8 +23,9 @@ - + + @@ -50,7 +33,8 @@ - + +
(tika) branch main updated: bump checkstyle line length to 180
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new dd0e6c61e bump checkstyle line length to 180 dd0e6c61e is described below commit dd0e6c61eec6d79b776e878f64ea37beb978b3ae Author: tallison AuthorDate: Wed Apr 10 10:39:39 2024 -0400 bump checkstyle line length to 180 --- tika-parent/checkstyle.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tika-parent/checkstyle.xml b/tika-parent/checkstyle.xml index f0d55ac9b..55dc19fe7 100644 --- a/tika-parent/checkstyle.xml +++ b/tika-parent/checkstyle.xml @@ -49,7 +49,7 @@ - + ftp://"/>
(tika) branch TIKA-4235 updated (eb31117ac -> 54bfc6ba5)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4235 in repository https://gitbox.apache.org/repos/asf/tika.git from eb31117ac Merge remote-tracking branch 'origin/main' into TIKA-4235 add 54bfc6ba5 TIKA_4235 -- add pipeline parameter No new revisions were added by this update. Summary of changes: .../org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java | 3 +-- .../org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java | 5 + 2 files changed, 6 insertions(+), 2 deletions(-)
(tika) branch TIKA-4235 updated (1071c2e14 -> eb31117ac)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4235 in repository https://gitbox.apache.org/repos/asf/tika.git from 1071c2e14 TIKA_4235 -- add pipeline parameter add c36efa316 Bump aws.version from 1.12.689 to 1.12.690 add 387d72392 Merge pull request #1700 from apache/dependabot/maven/aws.version-1.12.690 add 81d193f56 Bump commons-io:commons-io from 2.15.1 to 2.16.0 add 941f8f26c Merge pull request #1701 from apache/dependabot/maven/commons-io-commons-io-2.16.0 add b34014d0d Bump aws.version from 1.12.690 to 1.12.691 add 08e47625f Merge pull request #1703 from apache/dependabot/maven/aws.version-1.12.691 add ce88af4a0 Bump aws.version from 1.12.691 to 1.12.692 add 9c941b3bf Merge pull request #1704 from apache/dependabot/maven/aws.version-1.12.692 add e01b4cab3 Bump aws.version from 1.12.692 to 1.12.693 add 8482a8b31 Merge pull request #1705 from apache/dependabot/maven/aws.version-1.12.693 add a6e874f1b Bump aws.version from 1.12.693 to 1.12.694 add 4f5f29264 Merge pull request #1706 from apache/dependabot/maven/aws.version-1.12.694 add a0d28351e Bump org.apache.maven.plugin-tools:maven-plugin-annotations add 5de0db8a8 Merge pull request #1707 from apache/dependabot/maven/org.apache.maven.plugin-tools-maven-plugin-annotations-3.12.0 add 28941613f TIKA-4234 (#1708) add eb31117ac Merge remote-tracking branch 'origin/main' into TIKA-4235 No new revisions were added by this update. Summary of changes: tika-parent/pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
(tika) branch TIKA-4235 created (now 1071c2e14)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4235 in repository https://gitbox.apache.org/repos/asf/tika.git at 1071c2e14 TIKA_4235 -- add pipeline parameter This branch includes the following new commits: new 1071c2e14 TIKA_4235 -- add pipeline parameter The 1 revision listed above as "new" is entirely new to this repository and will be described in a separate email. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA_4235 -- add pipeline parameter
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4235 in repository https://gitbox.apache.org/repos/asf/tika.git commit 1071c2e14cab8418c44fa9b279eb5f9150ab377a Author: tallison AuthorDate: Thu Apr 4 14:02:53 2024 -0400 TIKA_4235 -- add pipeline parameter --- .../pipes/emitter/opensearch/OpenSearchClient.java | 18 +- .../pipes/emitter/opensearch/OpenSearchEmitter.java| 6 -- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java index c757115a1..8be41653e 100644 --- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java +++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java @@ -21,8 +21,10 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.io.StringWriter; +import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.List; +import java.util.Optional; import java.util.UUID; import com.fasterxml.jackson.core.JsonFactory; @@ -68,16 +70,21 @@ public class OpenSearchClient { } -public void emitDocuments(List emitData) throws IOException, TikaClientException { +public void emitDocuments(List emitData, Optional pipeline) throws IOException, +TikaClientException { StringBuilder json = new StringBuilder(); for (EmitData d : emitData) { appendDoc(d.getEmitKey().getEmitKey(), d.getMetadataList(), json); } -emitJson(json); +emitJson(json, pipeline); } -private void emitJson(StringBuilder json) throws IOException, TikaClientException { +private void emitJson(StringBuilder json, Optional pipeline) throws IOException, +TikaClientException { 
String requestUrl = openSearchUrl + "/_bulk"; +if (pipeline.isPresent()) { +requestUrl += "?pipeline=" + URLEncoder.encode(pipeline.get()); +} JsonResponse response = postJson(requestUrl, json.toString()); if (response.getStatus() != 200) { throw new TikaClientException(response.getMsg()); @@ -92,12 +99,13 @@ public class OpenSearchClient { } -public void emitDocument(String emitKey, List metadataList) throws IOException, +public void emitDocument(String emitKey, List metadataList, + Optional pipeline) throws IOException, TikaClientException { StringBuilder json = new StringBuilder(); appendDoc(emitKey, metadataList, json); -emitJson(json); +emitJson(json, pipeline); } private void appendDoc(String emitKey, List metadataList, StringBuilder json) diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java index bc010cf46..b547882cf 100644 --- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java +++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Optional; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -64,6 +65,7 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable private OpenSearchClient openSearchClient; private final HttpClientFactory httpClientFactory; private String embeddedFileFieldName = DEFAULT_EMBEDDED_FILE_FIELD_NAME; +private String pipeline = null; public OpenSearchEmitter() throws TikaConfigException { httpClientFactory = new HttpClientFactory(); @@ -77,7 +79,7 @@ public class OpenSearchEmitter extends AbstractEmitter 
implements Initializable } try { LOG.debug("about to emit {} docs", emitData.size()); -openSearchClient.emitDocuments(emitData); +openSearchClient.emitDocuments(emitData, Optional.ofNullable(pipeline)); LOG.info("successfully emitted {} docs", emitData.size()); } catch (TikaClientException e) { LOG.warn("problem emitting docs", e); @@ -94,7 +96,7 @@ public class OpenSearchEmitter extends AbstractEmitter implements Initializable } t
(tika) branch main updated: TIKA-4234 (#1708)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/main by this push: new 28941613f TIKA-4234 (#1708) 28941613f is described below commit 28941613fe02622fba79a7544db1270b3dabac92 Author: Tim Allison AuthorDate: Thu Apr 4 10:31:46 2024 -0400 TIKA-4234 (#1708) * TIKA_4234 -- improve jdbc reporter --- .../pipes/reporters/jdbc/JDBCPipesReporter.java| 126 +++-- .../reporters/jdbc/TestJDBCPipesReporter.java | 33 ++ .../resources/configs/tika-config-advanced.xml | 50 3 files changed, 199 insertions(+), 10 deletions(-) diff --git a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java index ee52bf80f..a1ffc8b15 100644 --- a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java +++ b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java @@ -69,6 +69,14 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl private String connectionString; +private boolean createTable = true; + +private String tableName = TABLE_NAME; + +private String reportSql; + +private List reportVariables; + private Optional postConnectionString = Optional.empty(); private final ArrayBlockingQueue queue = new ArrayBlockingQueue(ARRAY_BLOCKING_QUEUE_SIZE); @@ -80,6 +88,15 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl if (StringUtils.isBlank(connectionString)) { throw new TikaConfigException("Must specify a connectionString"); } +if (reportVariables == null) { +reportVariables = new ArrayList<>(); 
+reportVariables.add("id"); +reportVariables.add("status"); +reportVariables.add("timestamp"); +} +if (reportSql == null) { +reportSql = "insert into " + getTableName() + " (id, status, timestamp) values (?,?,?)"; +} ReportWorker reportWorker = new ReportWorker(connectionString, postConnectionString, queue, cacheSize, reportWithinMs); reportWorker.init(); @@ -113,6 +130,76 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl this.cacheSize = cacheSize; } +/** + * The default is true. In a distributed setting with multiple + * servers, this should be set to false, and you'll need to set up + * the table on your own. + * + * NOTE The default behavior is to drop the table if it exists and + * then create it. Make sure to set this to false if you do not want + * to drop the table. + * @param createTable + */ +@Field +public void setCreateTable(boolean createTable) { +this.createTable = createTable; +} + +/** + * The default is {@link JDBCPipesReporter#TABLE_NAME} + * @param tableName + */ +@Field +public void setTableName(String tableName) { +this.tableName = tableName; +} + +/** + * This is the sql for the prepared statement to execute + * to store the report record. the default is: + * insert into tika_status (id, status, timestamp) values (?,?,?) + * + * This can be modified for specific dialects of SQL or to run an upsert, merge or update + * instead of the default insert. + * + * Users need to coordinate this with {@link #setReportVariables(List)} + * @param reportSql + */ +@Field +public void setReportSql(String reportSql) { +this.reportSql = reportSql; +} + +public String getTableName() { +return tableName; +} + +public List getReportVariables() { +return reportVariables; +} + +public String getReportSql() { +return reportSql; +} + +public boolean isCreateTable() { +return createTable; +} +/** + * ADVANCED: This is used to set the variables in the prepared statement for + * the report. 
This needs to be coordinated with {@link #setReportSql(String)}. + * The available variables are "id, status, timestamp". If you're modifying to an update + * statement like "update table tika_status set status=?, timestamp=? where id = ?" + * then the values for this would be ["status", "timestamp", "id"]. + * + * The default for the insert is ["id&qu
(tika) branch TIKA-4234 deleted (was a29488c60)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4234 in repository https://gitbox.apache.org/repos/asf/tika.git was a29488c60 TIKA_4234 -- improve jdbc reporter, improve comment The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch TIKA-4234 updated: TIKA_4234 -- improve jdbc reporter, improve comment
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4234 in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/TIKA-4234 by this push: new a29488c60 TIKA_4234 -- improve jdbc reporter, improve comment a29488c60 is described below commit a29488c6087af269a36bab1aff13f56de549a6cf Author: tallison AuthorDate: Thu Apr 4 10:09:26 2024 -0400 TIKA_4234 -- improve jdbc reporter, improve comment --- .../java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java | 1 + 1 file changed, 1 insertion(+) diff --git a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java index 51abee666..a1ffc8b15 100644 --- a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java +++ b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java @@ -323,6 +323,7 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl if (isCreateTable()) { createTable(); } +//table must exist for this to work createPreparedStatement(); } catch (SQLException e) { throw new TikaConfigException("Problem creating connection, etc", e);
(tika) branch TIKA-4234 created (now f41f047f1)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4234 in repository https://gitbox.apache.org/repos/asf/tika.git at f41f047f1 TIKA_4234 -- improve jdbc reporter This branch includes the following new commits: new f41f047f1 TIKA_4234 -- improve jdbc reporter The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika) 01/01: TIKA_4234 -- improve jdbc reporter
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4234 in repository https://gitbox.apache.org/repos/asf/tika.git commit f41f047f1c22945fcb2f384253fa507565af77c9 Author: tallison AuthorDate: Thu Apr 4 10:07:41 2024 -0400 TIKA_4234 -- improve jdbc reporter --- .../pipes/reporters/jdbc/JDBCPipesReporter.java| 125 +++-- .../reporters/jdbc/TestJDBCPipesReporter.java | 33 ++ .../resources/configs/tika-config-advanced.xml | 50 + 3 files changed, 198 insertions(+), 10 deletions(-) diff --git a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java index ee52bf80f..51abee666 100644 --- a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java +++ b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java @@ -69,6 +69,14 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl private String connectionString; +private boolean createTable = true; + +private String tableName = TABLE_NAME; + +private String reportSql; + +private List reportVariables; + private Optional postConnectionString = Optional.empty(); private final ArrayBlockingQueue queue = new ArrayBlockingQueue(ARRAY_BLOCKING_QUEUE_SIZE); @@ -80,6 +88,15 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl if (StringUtils.isBlank(connectionString)) { throw new TikaConfigException("Must specify a connectionString"); } +if (reportVariables == null) { +reportVariables = new ArrayList<>(); +reportVariables.add("id"); +reportVariables.add("status"); +reportVariables.add("timestamp"); +} +if (reportSql == null) { +reportSql = "insert into " + getTableName() 
+ " (id, status, timestamp) values (?,?,?)"; +} ReportWorker reportWorker = new ReportWorker(connectionString, postConnectionString, queue, cacheSize, reportWithinMs); reportWorker.init(); @@ -113,6 +130,76 @@ public class JDBCPipesReporter extends PipesReporterBase implements Initializabl this.cacheSize = cacheSize; } +/** + * The default is true. In a distributed setting with multiple + * servers, this should be set to false, and you'll need to set up + * the table on your own. + * + * NOTE The default behavior is to drop the table if it exists and + * then create it. Make sure to set this to false if you do not want + * to drop the table. + * @param createTable + */ +@Field +public void setCreateTable(boolean createTable) { +this.createTable = createTable; +} + +/** + * The default is {@link JDBCPipesReporter#TABLE_NAME} + * @param tableName + */ +@Field +public void setTableName(String tableName) { +this.tableName = tableName; +} + +/** + * This is the sql for the prepared statement to execute + * to store the report record. the default is: + * insert into tika_status (id, status, timestamp) values (?,?,?) + * + * This can be modified for specific dialects of SQL or to run an upsert, merge or update + * instead of the default insert. + * + * Users need to coordinate this with {@link #setReportVariables(List)} + * @param reportSql + */ +@Field +public void setReportSql(String reportSql) { +this.reportSql = reportSql; +} + +public String getTableName() { +return tableName; +} + +public List getReportVariables() { +return reportVariables; +} + +public String getReportSql() { +return reportSql; +} + +public boolean isCreateTable() { +return createTable; +} +/** + * ADVANCED: This is used to set the variables in the prepared statement for + * the report. This needs to be coordinated with {@link #setReportSql(String)}. + * The available variables are "id, status, timestamp". 
If you're modifying to an update + * statement like "update table tika_status set status=?, timestamp=? where id = ?" + * then the values for this would be ["status", "timestamp", "id"]. + * + * The default for the insert is ["id", "status", "timestamp"] + * @param variables + */ + +@Field +public void setReportVariables(List variables) { +reportVariables = va
(tika-helm) 01/01: 2.9.2.0 release
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch 2.9.2.0-release in repository https://gitbox.apache.org/repos/asf/tika-helm.git commit c3c726fa1f39ee5e1ad59f309fbaab1a2a16acd1 Author: tallison AuthorDate: Tue Apr 2 15:19:29 2024 -0400 2.9.2.0 release --- Chart.yaml | 4 ++-- values.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Chart.yaml b/Chart.yaml index 5dc506b..cd2ea84 100644 --- a/Chart.yaml +++ b/Chart.yaml @@ -19,8 +19,8 @@ --- apiVersion: v2 name: tika -appVersion: "2.9.0.0-full" -version: "2.9.0-full" +appVersion: "2.9.2.0-full" +version: "2.9.2-full" description: The official Helm chart for Apache Tika type: application keywords: diff --git a/values.yaml b/values.yaml index a446121..80c2e5f 100644 --- a/values.yaml +++ b/values.yaml @@ -23,7 +23,7 @@ image: repository: apache/tika pullPolicy: IfNotPresent # Overrides the image tag whose default is the chart appVersion. - tag: "2.9.0.0-full" + tag: "2.9.2.0-full" imagePullSecrets: [] nameOverride: ""
(tika-helm) branch 2.9.2.0-release created (now c3c726f)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch 2.9.2.0-release in repository https://gitbox.apache.org/repos/asf/tika-helm.git at c3c726f 2.9.2.0 release This branch includes the following new commits: new c3c726f 2.9.2.0 release The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
(tika-docker) branch main updated: prep for 2.9.2.0 release
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika-docker.git The following commit(s) were added to refs/heads/main by this push: new 83d1eda prep for 2.9.2.0 release 83d1eda is described below commit 83d1eda81a0852241d24b1fe603f87b33a596868 Author: tallison AuthorDate: Tue Apr 2 14:56:25 2024 -0400 prep for 2.9.2.0 release --- .env | 2 +- CHANGES.md | 3 +++ README.md | 4 full/Dockerfile| 5 +++-- minimal/Dockerfile | 7 +-- 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/.env b/.env index e396322..8a5e270 100644 --- a/.env +++ b/.env @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -TAG=2.9.1.0 \ No newline at end of file +TAG=2.9.2.0 \ No newline at end of file diff --git a/CHANGES.md b/CHANGES.md index 2c2c4ec..7b3e3d0 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,9 @@ As of 2.5.0.1, we started adding a digit for Docker versions. Going forward, we a four digit version, where the first three are the Tika version and the last one is the docker version. As of 2.5.0.2, we started tagging release commits in our github repo. 
+* 2.9.2.0 (10 October 2023) + * Initial release for Tika 2.9.2 + * 2.9.1.0 (10 October 2023) * Initial release for Tika 2.9.1 diff --git a/README.md b/README.md index 1ff0543..78e0baa 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,10 @@ To install more languages simply update the apt-get command to include the packa ## Available Tags Below are the most recent 2.x series tags: +- `latest`, `2.9.2.0`: Apache Tika Server 2.9.2.0 (Minimal) +- `latest-full`, `2.9.2.0-full`: Apache Tika Server 2.9.2.0 (Full) +- `2.9.2.0`, `2.9.2.0`: Apache Tika Server 2.9.2.0 (Minimal) +- `2.9.2.0`, `2.9.2.0-full`: Apache Tika Server 2.9.2.0 (Full) - `latest`, `2.9.1.0`: Apache Tika Server 2.9.1.0 (Minimal) - `latest-full`, `2.9.1.0-full`: Apache Tika Server 2.9.1.0 (Full) - `2.9.1.0`, `2.9.1.0`: Apache Tika Server 2.9.1.0 (Minimal) diff --git a/full/Dockerfile b/full/Dockerfile index 96d09ee..dfe369a 100644 --- a/full/Dockerfile +++ b/full/Dockerfile @@ -37,9 +37,10 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get -y install gnupg2 w && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || exit 1 \ && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ -&& sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1; +&& sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1 \ +&& gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar -RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi +#RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify 
/tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi FROM base as runtime ARG UID_GID diff --git a/minimal/Dockerfile b/minimal/Dockerfile index ce413e0..9c05179 100644 --- a/minimal/Dockerfile +++ b/minimal/Dockerfile @@ -43,9 +43,12 @@ RUN set -eux \ && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || exit 1 \ && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ -&& sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1; +&& sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1 \ +&& gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar -RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi +# this used to work, but I'm getting "ERROR: failed to solve: failed to prepare $data as $data2: invalid argument" +# when trying to build 2.9.2.0 +#RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi FROM base as runtime # must reference uid_gid
svn commit: r68267 [1/2] - in /release/tika/2.9.2: CHANGES-2.9.2.txt CHANGES.txt
Author: tallison Date: Tue Apr 2 18:29:06 2024 New Revision: 68267 Log: fix changes.txt name Added: release/tika/2.9.2/CHANGES-2.9.2.txt Removed: release/tika/2.9.2/CHANGES.txt
svn commit: r68267 [2/2] - in /release/tika/2.9.2: CHANGES-2.9.2.txt CHANGES.txt
Added: release/tika/2.9.2/CHANGES-2.9.2.txt == --- release/tika/2.9.2/CHANGES-2.9.2.txt (added) +++ release/tika/2.9.2/CHANGES-2.9.2.txt Tue Apr 2 18:29:06 2024 @@ -0,0 +1,3205 @@ +Release 2.9.2 - 3/26/2024 + + * Dependency upgrades including temporary workarounds for regressions in commons-compress. + + * Add detection for OpenSCAD, 3MF, AMF, STL file formats via Robin Schimpf (TIKA-4222, TIKA-4223, + TIKA-4224, TIKA-4225). + +Release 2.9.1 - 10/17/2023 + + * Dependency upgrades including commons-compress to fix CVE-2023-42503. + + * Improve RFC822 detection (TIKA-4153). + + * Enable configuration of "maxJsonStringFieldLength" in TikaConfig to allow users to + avoid DEFAULT_MAX_STRING_LEN exceptions from Jackson (TIKA-4154). + + * Fix bug in DateUtils that stripped timezone information from + incoming Calendar objects (TIKA-4126). + + * The InputStreamDigester now calculates stream length (TIKA-4016). + +Release 2.9.0 - 8/23/2023 + + * With user configuration, the PDFParser can now throw an EncryptedDocumentException + for Microsoft IRM PDF containers with encrypted payloads. Separately, + the PDFParser now throws an EncryptedDocumentException instead of an IOException + if the security handler cannot be found (TIKA-4082). + + * Fix bug that led to duplicate extraction of macros from some OLE2 containers (TIKA-4116). + + * Parse iframe's srcdoc as an embedded file (TIKA-3109). + + * Add detection of warc.gz as a specialization of gz and parse as if a standard WARC (TIKA-4048). + + * Allow users to modify the attachment limit size in the /unpack resource (TIKA-4039) + + * Fixed write limit bug in RecursiveParserWrapper (TIKA-4055). + + * Add mime detection for many files with thanks to Gregory Lepore (TIKA-3992). + + * Fixed iWork 13 keynote detection on files with wrong extension (TIKA-4111). + +Release 2.8.0 - 5/11/2023 + + * Enable counting and/or parsing of incremental updates in PDFs. 
This + is an experimental feature and may change in later releases (TIKA-4017). + + * Fixed bug that prevented the the loading of CompositeExternalParser in tika-app and + tika-server-standard. This parser will call exiftool and ffmpeg if those are installed, as was + the behavior in Tika 1.x. Exclude org.apache.tika.parser.external.CompositeExternalParser + if you do not want this behavior (TIKA-4022). + + * Removed the shading of tika-parsers-standard-module (TIKA-4038). + + * Enable optional extraction of file system metadata in FileSystemFetcher (TIKA-4035). + + * Allow pretty printing in FileSystemEmitter (TIKA-4034). + + * Add detection for and a new mime type for older postscript-based + Adobe Illustrator "application/illustrator+ps" files (TIKA-3971). + + * Add magic detection for canon raw file types: crw, cr2 and cr3 (TIKA-3991). + + * Add detection for ONIX message files (TIKA-4011). + + * Add detection and a parser for ActiveMime files (TIKA-3987). + + * Add extraction of rendition layout value and version from Epub (TIKA-4013). + + * Improve embedded file extraction from PDFs (TIKA-4012). + + * Improve metadata extraction from WARCs (TIKA-4018). + + * Update to PDFBox 2.0.28 (TIKA-4016). + + * Users may now avoid the ZeroByteFileException via a + setting on the AutoDetectParserConfig (TIKA-3976). + + * Fix bug in closing elements in the presence of elements + in RTF files (TIKA-3972). + + * Improve extraction of embedded file names in .docx (TIKA-3968). + + * Normalize author, title, subject and description to their Dublin Core + properties in the HTMLParser (TIKA-3963). + + +Release 2.7.0 - 1/31/2023 + + * Add SVG detection for svg files that lack the xml header (TIKA-3308). + + * Migrate to a live fork of Universal Charset Detector (TIKA-3213). + + * Improve handling of text-based attachments inside .eml files (TIKA-3959). + + * Add tika-parser-nlp-package to release artifacts (TIKA-3958). 
+ + * Remove need for element in classes that extend ConfigBase (TIKA-3946). + + * Add X-TIKA:embedded_id_path to ensure unique embedded file paths (TIKA-3942). + + * Fix bug that prevented digests when the fallback/EmptyParser + was called (TIKA-3939). + + * Remove log4j 1.2.x (and slf4j-log4j12 which now redirects to slf4j-reload4j) from + all modules (TIKA-3935). + + * Upgrade mime4j to 0.8.9 (TIKA-3950). + + * Refactor date parsing for emails (TIKA-3957) + + * Upgrade to Bouncy Castle 1.71 and jdk18on jars (TIKA-3933). + + * Add a JDBCPipesReporter (TIKA-3931). + + * Add multivalued field strategy option in jdbc-emitter (TIKA-3930). + Default is now 'concatenate' with ', ' as the delimiter. + + * Downgrade logging in PipesClient for each parse from info to debug. + +Release 2.6.0 - 11/3/2022 + + * Add optional Siegfried detector (TIKA-3901). + + *
svn commit: r1916753 - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/ publis
Author: tallison Date: Tue Apr 2 18:25:06 2024 New Revision: 1916753 URL: http://svn.apache.org/viewvc?rev=1916753&view=rev Log: Update website for 2.9.2 release [This commit notification would consist of 581 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.]
svn commit: r1916752 [46/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/3.0.0-BETA/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/3.0.0-BETA/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/3.0.0-BETA/examples.html (original) +++ tika/site/publish/3.0.0-BETA/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika 
tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the
svn commit: r1916752 [42/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.7.0/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.7.0/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.7.0/examples.html (original) +++ tika/site/publish/2.7.0/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { p;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [28/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.28/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.28/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.28/examples.html (original) +++ tika/site/publish/1.28/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [43/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.8.0/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.8.0/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.8.0/examples.html (original) +++ tika/site/publish/2.8.0/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [47/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/license.html URL: http://svn.apache.org/viewvc/tika/site/publish/license.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/license.html (original) +++ tika/site/publish/license.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("./css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -980,20 +943,7 @@ - -Search with Apache Solr -http://search.lucidimagination.com/p:tika; - method="get" id="searchform"> - - -provider -Lucid Find -Search-Lucene - - - - + Books about Tika @@ -1007,13 +957,10 @@ - Copyright 2023 + Copyright 2024 https://www.apache.org/;>The Apache Software Foundation. Site powered by https://maven.apache.org/;>Apache Maven. - Search powered by - http://www.lucidimagination.com;>Lucid Imagination - and http://sematext.com;>Sematext. 
- + Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. Modified: tika/site/publish/mail-lists.html URL: http://svn.apache.org/viewvc/tika/site/publish/mail-lists.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/mail-lists.html (original) +++ tika/site/publish/mail-lists.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("./css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org;
svn commit: r1916752 [34/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.1.0/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.1.0/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.1.0/examples.html (original) +++ tika/site/publish/2.1.0/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [20/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.25/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.25/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.25/examples.html (original) +++ tika/site/publish/1.25/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [29/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.3/parser.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.3/parser.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.3/parser.html (original) +++ tika/site/publish/1.3/parser.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -859,20 +822,7 @@ try { - -Search with Apache Solr -http://search.lucidimagination.com/p:tika; - method="get" id="searchform"> - - -provider -Lucid Find -Search-Lucene - - - - + Books about Tika @@ -886,13 +836,10 @@ try { - Copyright 2023 + Copyright 2024 https://www.apache.org/;>The Apache Software Foundation. Site powered by https://maven.apache.org/;>Apache Maven. - Search powered by - http://www.lucidimagination.com;>Lucid Imagination - and http://sematext.com;>Sematext. 
- + Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. Modified: tika/site/publish/1.3/parser_guide.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.3/parser_guide.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.3/parser_guide.html (original) +++ tika/site/publish/1.3/parser_guide.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - +
svn commit: r1916752 [30/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.7/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.7/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.7/examples.html (original) +++ tika/site/publish/1.7/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -115,23 +78,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [33/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.0.0/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.0.0/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.0.0/examples.html (original) +++ tika/site/publish/2.0.0/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [31/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.8/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.8/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.8/examples.html (original) +++ tika/site/publish/1.8/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -115,23 +78,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [13/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.19/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.19/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.19/examples.html (original) +++ tika/site/publish/1.19/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [11/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.18/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.18/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.18/examples.html (original) +++ tika/site/publish/1.18/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [32/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.9/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.9/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.9/examples.html (original) +++ tika/site/publish/1.9/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [7/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/
Modified: tika/site/publish/1.14/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.14/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.14/examples.html (original) +++ tika/site/publish/1.14/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [14/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.2/parser.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.2/parser.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.2/parser.html (original) +++ tika/site/publish/1.2/parser.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -859,20 +822,7 @@ try { - -Search with Apache Solr -http://search.lucidimagination.com/p:tika; - method="get" id="searchform"> - - -provider -Lucid Find -Search-Lucene - - - - + Books about Tika @@ -886,13 +836,10 @@ try { - Copyright 2023 + Copyright 2024 https://www.apache.org/;>The Apache Software Foundation. Site powered by https://maven.apache.org/;>Apache Maven. - Search powered by - http://www.lucidimagination.com;>Lucid Imagination - and http://sematext.com;>Sematext. 
- + Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. Modified: tika/site/publish/1.2/parser_guide.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.2/parser_guide.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.2/parser_guide.html (original) +++ tika/site/publish/1.2/parser_guide.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - +
svn commit: r1916752 [10/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.17/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.17/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.17/examples.html (original) +++ tika/site/publish/1.17/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [3/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/
Modified: tika/site/publish/1.1/parser_guide.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.1/parser_guide.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.1/parser_guide.html (original) +++ tika/site/publish/1.1/parser_guide.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -864,20 +827,7 @@ public class HelloParser implements Pars - -Search with Apache Solr -http://search.lucidimagination.com/p:tika; - method="get" id="searchform"> - - -provider -Lucid Find -Search-Lucene - - - - + Books about Tika @@ -891,13 +841,10 @@ public class HelloParser implements Pars - Copyright 2023 + Copyright 2024 https://www.apache.org/;>The Apache Software Foundation. Site powered by https://maven.apache.org/;>Apache Maven. 
- Search powered by - http://www.lucidimagination.com;>Lucid Imagination - and http://sematext.com;>Sematext. - + Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. Modified: tika/site/publish/1.10/configuring.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.10/configuring.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.10/configuring.html (original) +++ tika/site/publish/1.10/configuring.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -
svn commit: r1916752 [4/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/
Modified: tika/site/publish/1.11/detection.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.11/detection.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.11/detection.html (original) +++ tika/site/publish/1.11/detection.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -886,20 +849,7 @@ for (InputStream is : myListOfStreams) { - -Search with Apache Solr -http://search.lucidimagination.com/p:tika; - method="get" id="searchform"> - - -provider -Lucid Find -Search-Lucene - - - - + Books about Tika @@ -913,13 +863,10 @@ for (InputStream is : myListOfStreams) { - Copyright 2023 + Copyright 2024 https://www.apache.org/;>The Apache Software Foundation. Site powered by https://maven.apache.org/;>Apache Maven. - Search powered by - http://www.lucidimagination.com;>Lucid Imagination - and http://sematext.com;>Sematext. 
- + Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. Modified: tika/site/publish/1.11/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.11/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.11/examples.html (original) +++ tika/site/publish/1.11/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -
svn commit: r1916752 [6/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/
Modified: tika/site/publish/1.13/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.13/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.13/examples.html (original) +++ tika/site/publish/1.13/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [15/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.21/configuring.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.21/configuring.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.21/configuring.html (original) +++ tika/site/publish/1.21/configuring.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -923,20 +886,7 @@ Parser autoDetectParser = new AutoDetect - -Search with Apache Solr -http://search.lucidimagination.com/p:tika; - method="get" id="searchform"> - - -provider -Lucid Find -Search-Lucene - - - - + Books about Tika @@ -950,13 +900,10 @@ Parser autoDetectParser = new AutoDetect - Copyright 2023 + Copyright 2024 https://www.apache.org/;>The Apache Software Foundation. Site powered by https://maven.apache.org/;>Apache Maven. 
- Search powered by - http://www.lucidimagination.com;>Lucid Imagination - and http://sematext.com;>Sematext. - + Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. Modified: tika/site/publish/1.21/detection.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.21/detection.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.21/detection.html (original) +++ tika/site/publish/1.21/detection.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -
svn commit: r1916752 [5/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/
Modified: tika/site/publish/1.12/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.12/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.12/examples.html (original) +++ tika/site/publish/1.12/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [1/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/
Author: tallison Date: Tue Apr 2 18:03:41 2024 New Revision: 1916752 URL: http://svn.apache.org/viewvc?rev=1916752=rev Log: remove outdated search providers Modified: tika/site/publish/0.10/detection.html tika/site/publish/0.10/formats.html tika/site/publish/0.10/gettingstarted.html tika/site/publish/0.10/index.html tika/site/publish/0.10/parser.html tika/site/publish/0.10/parser_guide.html tika/site/publish/0.5/documentation.html tika/site/publish/0.5/formats.html tika/site/publish/0.5/gettingstarted.html tika/site/publish/0.5/index.html tika/site/publish/0.6/formats.html tika/site/publish/0.6/gettingstarted.html tika/site/publish/0.6/index.html tika/site/publish/0.6/parser.html tika/site/publish/0.7/detection.html tika/site/publish/0.7/formats.html tika/site/publish/0.7/gettingstarted.html tika/site/publish/0.7/index.html tika/site/publish/0.7/parser.html tika/site/publish/0.7/parser_guide.html tika/site/publish/0.8/detection.html tika/site/publish/0.8/formats.html tika/site/publish/0.8/gettingstarted.html tika/site/publish/0.8/index.html tika/site/publish/0.8/parser.html tika/site/publish/0.8/parser_guide.html tika/site/publish/0.9/detection.html tika/site/publish/0.9/formats.html tika/site/publish/0.9/gettingstarted.html tika/site/publish/0.9/index.html tika/site/publish/0.9/parser.html tika/site/publish/0.9/parser_guide.html tika/site/publish/1.0/detection.html tika/site/publish/1.0/formats.html tika/site/publish/1.0/gettingstarted.html tika/site/publish/1.0/index.html tika/site/publish/1.0/parser.html tika/site/publish/1.0/parser_guide.html tika/site/publish/1.1/detection.html tika/site/publish/1.1/formats.html tika/site/publish/1.1/gettingstarted.html tika/site/publish/1.1/index.html tika/site/publish/1.1/parser.html tika/site/publish/1.1/parser_guide.html tika/site/publish/1.10/configuring.html tika/site/publish/1.10/detection.html tika/site/publish/1.10/examples.html tika/site/publish/1.10/formats.html tika/site/publish/1.10/gettingstarted.html 
tika/site/publish/1.10/index.html tika/site/publish/1.10/parser.html tika/site/publish/1.10/parser_guide.html tika/site/publish/1.11/configuring.html tika/site/publish/1.11/detection.html tika/site/publish/1.11/examples.html tika/site/publish/1.11/formats.html tika/site/publish/1.11/gettingstarted.html tika/site/publish/1.11/index.html tika/site/publish/1.11/parser.html tika/site/publish/1.11/parser_guide.html tika/site/publish/1.12/configuring.html tika/site/publish/1.12/detection.html tika/site/publish/1.12/examples.html tika/site/publish/1.12/formats.html tika/site/publish/1.12/gettingstarted.html tika/site/publish/1.12/index.html tika/site/publish/1.12/parser.html tika/site/publish/1.12/parser_guide.html tika/site/publish/1.13/configuring.html tika/site/publish/1.13/detection.html tika/site/publish/1.13/examples.html tika/site/publish/1.13/formats.html tika/site/publish/1.13/gettingstarted.html tika/site/publish/1.13/index.html tika/site/publish/1.13/parser.html tika/site/publish/1.13/parser_guide.html tika/site/publish/1.14/configuring.html tika/site/publish/1.14/detection.html tika/site/publish/1.14/examples.html tika/site/publish/1.14/formats.html tika/site/publish/1.14/gettingstarted.html tika/site/publish/1.14/index.html tika/site/publish/1.14/parser.html tika/site/publish/1.14/parser_guide.html tika/site/publish/1.15/configuring.html tika/site/publish/1.15/detection.html tika/site/publish/1.15/examples.html tika/site/publish/1.15/formats.html tika/site/publish/1.15/gettingstarted.html tika/site/publish/1.15/index.html tika/site/publish/1.15/parser.html tika/site/publish/1.15/parser_guide.html tika/site/publish/1.16/configuring.html tika/site/publish/1.16/detection.html tika/site/publish/1.16/examples.html tika/site/publish/1.16/formats.html tika/site/publish/1.16/gettingstarted.html tika/site/publish/1.16/index.html tika/site/publish/1.16/parser.html tika/site/publish/1.16/parser_guide.html tika/site/publish/1.17/configuring.html 
tika/site/publish/1.17/detection.html tika/site/publish/1.17/examples.html tika/site/publish/1.17/formats.html tika/site/publish/1.17/gettingstarted.html tika/site/publish/1.17/index.html tika/site/publish/1.17/parser.html tika/site/publish/1.17/parser_guide.html tika/site/publish/1.18/configuring.html tika/site/publish/1.18/detection.html tika/site/publish/1.18/examples.html tika/site/publish/1.18/formats.html tika/site/publish/1.18/gettingstarted.html tika/site/publish/1.18/index.html tika/site/publish/1.18/parser.html tika/site/publish/1.18/parser_guide.html
svn commit: r1916752 [39/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.4.1/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.4.1/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.4.1/examples.html (original) +++ tika/site/publish/2.4.1/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [37/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.3.0/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.3.0/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.3.0/examples.html (original) +++ tika/site/publish/2.3.0/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [36/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.2.1/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.2.1/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.2.1/examples.html (original) +++ tika/site/publish/2.2.1/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [35/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.2.0/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.2.0/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.2.0/examples.html (original) +++ tika/site/publish/2.2.0/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [21/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.26/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.26/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.26/examples.html (original) +++ tika/site/publish/1.26/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [17/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.23/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.23/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.23/examples.html (original) +++ tika/site/publish/1.23/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [41/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.6.0/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.6.0/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.6.0/examples.html (original) +++ tika/site/publish/2.6.0/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [38/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.4.0/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.4.0/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.4.0/examples.html (original) +++ tika/site/publish/2.4.0/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [26/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.28.4/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.28.4/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.28.4/examples.html (original) +++ tika/site/publish/1.28.4/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new 
Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { p;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as
svn commit: r1916752 [44/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.9.0/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.9.0/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.9.0/examples.html (original) +++ tika/site/publish/2.9.0/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [40/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.5.0/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.5.0/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.5.0/examples.html (original) +++ tika/site/publish/2.5.0/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [45/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/2.9.1/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/2.9.1/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/2.9.1/examples.html (original) +++ tika/site/publish/2.9.1/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [23/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.28.1/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.28.1/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.28.1/examples.html (original) +++ tika/site/publish/1.28.1/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new 
Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body
svn commit: r1916752 [27/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.28.5/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.28.5/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.28.5/examples.html (original) +++ tika/site/publish/1.28.5/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new 
Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { p;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as
svn commit: r1916752 [18/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.24.1/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.24.1/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.24.1/examples.html (original) +++ tika/site/publish/1.24.1/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new 
Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body
svn commit: r1916752 [19/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.24/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.24/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.24/examples.html (original) +++ tika/site/publish/1.24/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [16/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.22/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.22/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.22/examples.html (original) +++ tika/site/publish/1.22/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [24/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.28.2/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.28.2/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.28.2/examples.html (original) +++ tika/site/publish/1.28.2/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new 
Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body
svn commit: r1916752 [25/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.28.3/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.28.3/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.28.3/examples.html (original) +++ tika/site/publish/1.28.3/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new 
Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { p;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as
svn commit: r1916752 [22/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.27/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.27/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.27/examples.html (original) +++ tika/site/publish/1.27/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [12/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12
Modified: tika/site/publish/1.19.1/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.19.1/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.19.1/examples.html (original) +++ tika/site/publish/1.19.1/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new 
Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body
svn commit: r1916752 [2/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/
Modified: tika/site/publish/0.7/parser_guide.html URL: http://svn.apache.org/viewvc/tika/site/publish/0.7/parser_guide.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/0.7/parser_guide.html (original) +++ tika/site/publish/0.7/parser_guide.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -864,20 +827,7 @@ public class HelloParser implements Pars - -Search with Apache Solr -http://search.lucidimagination.com/p:tika; - method="get" id="searchform"> - - -provider -Lucid Find -Search-Lucene - - - - + Books about Tika @@ -891,13 +841,10 @@ public class HelloParser implements Pars - Copyright 2023 + Copyright 2024 https://www.apache.org/;>The Apache Software Foundation. Site powered by https://maven.apache.org/;>Apache Maven. 
- Search powered by - http://www.lucidimagination.com;>Lucid Imagination - and http://sematext.com;>Sematext. - + Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika project logo are trademarks of The Apache Software Foundation. Modified: tika/site/publish/0.8/detection.html URL: http://svn.apache.org/viewvc/tika/site/publish/0.8/detection.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/0.8/detection.html (original) +++ tika/site/publish/0.8/detection.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -
svn commit: r1916752 [8/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/
Modified: tika/site/publish/1.15/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.15/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.15/examples.html (original) +++ tika/site/publish/1.15/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
svn commit: r1916752 [9/47] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/
Modified: tika/site/publish/1.16/examples.html URL: http://svn.apache.org/viewvc/tika/site/publish/1.16/examples.html?rev=1916752=1916751=1916752=diff == --- tika/site/publish/1.16/examples.html (original) +++ tika/site/publish/1.16/examples.html Tue Apr 2 18:03:41 2024 @@ -34,45 +34,8 @@ @import url("../css/site.css"); - - function selectProvider(form) { -provider = form.elements['searchProvider'].value; -if (provider == "any") { - if (Math.random() > 0.5) { -provider = "lucid"; - } else { -provider = "sl"; - } -} -if (provider == "lucid") { - form.action = "http://find.searchhub.org/p:tika"; -} else if (provider == "sl") { - form.action = "http://search-lucene.com/tika"; -} -days = 90; -date = new Date(); -date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000)); -expires = "; expires=" + date.toGMTString(); -document.cookie = "searchProvider=" + provider + expires + "; path=/"; - } - function initProvider() { -if (document.cookie.length>0) { - cStart=document.cookie.indexOf("searchProvider="); - if (cStart!=-1) { -cStart=cStart + "searchProvider=".length; -cEnd=document.cookie.indexOf(";", cStart); -if (cEnd==-1) { - cEnd=document.cookie.length; -} -provider = unescape(document.cookie.substring(cStart,cEnd)); -document.forms['searchform'].elements['searchProvider'].value = provider; - } -} -document.forms['searchform'].elements['q'].focus(); - } - - + https://tika.apache.org; id="bannerLeft" title="Apache Tika" @@ -116,23 +79,23 @@ The Tika facade, provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text @import url('attached-includes/css/shCoreDefault.css'); -public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} +public String parseToStringExample() throws IOException, SAXException, TikaException {Tika tika = new Tika();try 
(InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) {return tika.parseToString(stream);}} Parsing using the Auto-Detect Parser -For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} +For more control, you can call the Tika Parsers directly. Most likely, you'll want to start out using the Auto-Detect Parser, which automatically figures out what kind of content you have, then calls the appropriate parser for you.public String parseExample() throws IOException, SAXException, TikaException {AutoDetectParser parser = new AutoDetectParser();BodyContentHandler handler = new BodyContentHandler();Metadata metadata = new Metadata();try (InputStream stream = ParsingExample.class.getResourceAsStream("test.doc")) { sp;parser.parse(stream, handler, metadata);return handler.toString();}} Picking different output formats With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html;>ContentHandler you supply to the Parser. 
Parsing to Plain Text -By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a plain-text string.public String parseToPlainText() throws IOException, SAXException, TikaException {BodyContentHandler handler = new BodyContentHandler();AutoDetectParser parser = new AutoDetectParser();Metadata metadata = new Metadata();try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {parser.parse(stream, handler, metadata);return handler.toString();}} +By using the BodyContentHandler, you can request that Tika return only the content of the document's body as a
(tika) annotated tag 2.9.2-rc2 deleted (was 55a70c070)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to annotated tag 2.9.2-rc2 in repository https://gitbox.apache.org/repos/asf/tika.git *** WARNING: tag 2.9.2-rc2 was deleted! *** tag was 55a70c070 The revisions that were on this annotated tag are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) annotated tag 2.9.2-rc1 deleted (was 3458c414f)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to annotated tag 2.9.2-rc1 in repository https://gitbox.apache.org/repos/asf/tika.git *** WARNING: tag 2.9.2-rc1 was deleted! *** tag was 3458c414f The revisions that were on this annotated tag are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) annotated tag 2.9.2 updated (1dbf284b7 -> e331d95cf)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to annotated tag 2.9.2 in repository https://gitbox.apache.org/repos/asf/tika.git *** WARNING: tag 2.9.2 was modified! *** from 1dbf284b7 (commit) to e331d95cf (tag) tagging 1dbf284b7131b13f0ab35162ac5914e2aba7baa6 (commit) replaces 2.9.2-rc1 by tallison on Tue Apr 2 14:00:42 2024 -0400 - Log - Tagging 2.9.2 release --- No new revisions were added by this update. Summary of changes:
svn commit: r68266 - /dev/tika/2.9.2/
Author: tallison Date: Tue Apr 2 17:58:47 2024 New Revision: 68266 Log: release 2.9.2 Removed: dev/tika/2.9.2/
svn commit: r68265 [3/3] - in /release/tika: 2.9.1/ 2.9.2/
Added: release/tika/2.9.2/tika-2.9.2-src.zip == Binary file - no diff available. Propchange: release/tika/2.9.2/tika-2.9.2-src.zip -- svn:mime-type = application/octet-stream Added: release/tika/2.9.2/tika-2.9.2-src.zip.asc == --- release/tika/2.9.2/tika-2.9.2-src.zip.asc (added) +++ release/tika/2.9.2/tika-2.9.2-src.zip.asc Tue Apr 2 17:55:52 2024 @@ -0,0 +1,16 @@ +-BEGIN PGP SIGNATURE- + +iQIzBAABCgAdFiEEGERU+thpd2Dz4A0uSlGkW5RP/VEFAmYC6p4ACgkQSlGkW5RP +/VHkew/8DP2ArFLkVHnWXQWCj5lIaN6j0Mhikcg6JvRjGvN6mpNzsZv9aYUBuGEz +JjGZhyCBGxZVcJgQGobEMgl8HqNsnj6o+CHMNr/MbAje54BriAglQGMiiu4nGEB5 +ifGQzvatEjgwtLJEKnOd+xQFF5Uq0mbm29qp48fKz2fkNdeWma0YzE+A6ZQcdzfx +Kh9D0RxQgdjDz5Sf1z2cTTjwUOj38XnPq5Ak1TIXoatYCohPOGvMyJPgthKVvhJ+ +8IcqbVy7ER9eCrQOfeaS2CvAjnKSeFqw6nO7pG/T1F+9X164XDJuP8c1MxRIj95y +qbo1U8sZ3THvSl6oXXrE8BNIiHsnEiujaDZfw+LsE6Y/XfLnNgOS7FrttKXGLM5e +wRzy9M39WcJHv6gsbWTnF7pEozPtT2eAp3CbIdedJzpDVaL0UKMw9Dpm3A5AiqUg +7Tq4Z/Ecdg2pU5TsNaflo2/oOF7jBFKRh3tuiCesQeNqhlxzpXVKWwmlP59eReGH +pO7Z0IhNGTJv2aPhACkdFCogfIAI+DiPRaN71SKH1mYsJ/eeyEgnupL/cChTw9Zc +ZtHEWWWUxLr6BLXD7K6jCEhrLi8Ca6ozsIrgllkyaoo8h85319zA/mlbO52ez2Gf +ZDDvGiRSX1FDrNvtANhJry6HIhRryXja21gWJT9CgCp4AD1HIAI= +=cW6O +-END PGP SIGNATURE- Added: release/tika/2.9.2/tika-2.9.2-src.zip.sha512 == --- release/tika/2.9.2/tika-2.9.2-src.zip.sha512 (added) +++ release/tika/2.9.2/tika-2.9.2-src.zip.sha512 Tue Apr 2 17:55:52 2024 @@ -0,0 +1 @@ +5ac7b981aa89d44e177dfb457d6f6b73dd54d43641da31e76b3e8bd9dbc236b9d2e6f6958d9182f36cbee6409293f3f21421f9c89837f693f5e10f997e9b063c Added: release/tika/2.9.2/tika-app-2.9.2.jar == Binary file - no diff available. 
Propchange: release/tika/2.9.2/tika-app-2.9.2.jar -- svn:mime-type = application/octet-stream Added: release/tika/2.9.2/tika-app-2.9.2.jar.asc == --- release/tika/2.9.2/tika-app-2.9.2.jar.asc (added) +++ release/tika/2.9.2/tika-app-2.9.2.jar.asc Tue Apr 2 17:55:52 2024 @@ -0,0 +1,16 @@ +-BEGIN PGP SIGNATURE- + +iQIzBAABCgAdFiEEGERU+thpd2Dz4A0uSlGkW5RP/VEFAmYC6Q4ACgkQSlGkW5RP +/VHUoBAAmnCXYNlpVzqlUGnxqn0WqJ8W0O4gpEDvUztYnHfcFRcDmPf0ZxjziYkS +WURF9J+i/ezK9Mx4j4SLcmGNwPAA3I0z37GCda4bpPZpUsLLBs9/q2kAC+bu2sD5 +8fEJm6InO0Bb1uRN1S0ONAHfbZyWbt0E2pqD60On7KoQW79bdODhaUPxlrWczoMc +vJrzoLGlGOX3zG2HtlZkRE8UG2TNP8J8SgLL26sivUiR97cnZ1U9Yi4/eYz2IZfw +uLVURm7x34PXqNPZ92De8HUJfesiGwQ73WUlRCH6lkNm8HqhhDSayJgEci0Z5suC +ocAiNXzLUmvJDDexREyM7DkVjNJyNJJ8RijZiI9lle+hssWeyaD5Wapg+4cck02Q +wK9k7SQSVu8HvqEqVBMZ+xfK21IUb9wvy04KP0luK/pPdmKR3WG23hIRSrAs0rrc +7qRBwbV5nxNorlQ7maEXoJSD/A9ie/wkpdAAckFVEl6zK1i4tZsIo4V1knMDPR8f +f+GVrwrKq2QRALLRf3AI0Dh+wA4NdDCJ6ADbQ7IIEdipirz1cMTteL3kMlAuQiiY +hhW0VNBWEyWDCGvQ2DBLjXBOC8+an90yMeRwmsk4bKnEna0Gl3nzbHYKYlJ9D2f9 +TeOHCzumByaUNHWLNe7fb5u2w56hOm8MXgQQ2HFTWP0BeYgRof0= +=ckSS +-END PGP SIGNATURE- Added: release/tika/2.9.2/tika-app-2.9.2.jar.sha512 == --- release/tika/2.9.2/tika-app-2.9.2.jar.sha512 (added) +++ release/tika/2.9.2/tika-app-2.9.2.jar.sha512 Tue Apr 2 17:55:52 2024 @@ -0,0 +1 @@ +55fe5fd70fefc2d0ca30abac024d5c659dc62a437741b52276fc7cd90be31f133a0a6e528863e81f642eab3e0c39690e63abb52f580e4261d4ac5bfd52934e25 Added: release/tika/2.9.2/tika-eval-app-2.9.2.jar == Binary file - no diff available. 
Propchange: release/tika/2.9.2/tika-eval-app-2.9.2.jar -- svn:mime-type = application/octet-stream Added: release/tika/2.9.2/tika-eval-app-2.9.2.jar.asc == --- release/tika/2.9.2/tika-eval-app-2.9.2.jar.asc (added) +++ release/tika/2.9.2/tika-eval-app-2.9.2.jar.asc Tue Apr 2 17:55:52 2024 @@ -0,0 +1,16 @@ +-BEGIN PGP SIGNATURE- + +iQIzBAABCgAdFiEEGERU+thpd2Dz4A0uSlGkW5RP/VEFAmYC6mUACgkQSlGkW5RP +/VHgaA//bUhBGGQRQ4cTJFAFienrAD1mCqpgsHYLd5s/pshOST8M9NyS590FXp69 +NGHxLHpU0PjLbdZEo9lRtM5sNGK79LxVlMcx0oOlCC4ZHM00RJ8bexp1WLHmBt40 +pW2DF0cgT4TBTCLsZbBlpLgOjUeYoadUJSlZwBtgUsScsWtzSZERd8JQoPcXiExM +ncttxP5bxrAUu//d+dV0yyRl04RcvVLL8yU/LFhG1V5JL+2JJd8lFICmzYEJY3rb ++nwsLoGUX9y2Pq1ELK8VFIhnSMPB6NIWYpu2I0Lm0pgs1A9/VtNmo61FUWWgWbA5 +FSlq8PjttJ0oG4LwR80yRhVhAF9lI3I6xUE0vMDPmOIBj75Q8s+SUrQLTWDCX0DZ +aTuynitouof5HFdYK1mOq8Ue9vmkYgr/9UHPzzOY8rVhp+8Z+MV6vLc3hDXo7UTS +sHNHNp0bPLh53ICsdnVrElMd4AtSFe/92aHpMN+JWybdfvLiLGIi57GWswDYeUwc
svn commit: r68265 [2/3] - in /release/tika: 2.9.1/ 2.9.2/
Added: release/tika/2.9.2/CHANGES.txt == --- release/tika/2.9.2/CHANGES.txt (added) +++ release/tika/2.9.2/CHANGES.txt Tue Apr 2 17:55:52 2024 @@ -0,0 +1,3205 @@ +Release 2.9.2 - 3/26/2024 + + * Dependency upgrades including temporary workarounds for regressions in commons-compress. + + * Add detection for OpenSCAD, 3MF, AMF, STL file formats via Robin Schimpf (TIKA-4222, TIKA-4223, + TIKA-4224, TIKA-4225). + +Release 2.9.1 - 10/17/2023 + + * Dependency upgrades including commons-compress to fix CVE-2023-42503. + + * Improve RFC822 detection (TIKA-4153). + + * Enable configuration of "maxJsonStringFieldLength" in TikaConfig to allow users to + avoid DEFAULT_MAX_STRING_LEN exceptions from Jackson (TIKA-4154). + + * Fix bug in DateUtils that stripped timezone information from + incoming Calendar objects (TIKA-4126). + + * The InputStreamDigester now calculates stream length (TIKA-4016). + +Release 2.9.0 - 8/23/2023 + + * With user configuration, the PDFParser can now throw an EncryptedDocumentException + for Microsoft IRM PDF containers with encrypted payloads. Separately, + the PDFParser now throws an EncryptedDocumentException instead of an IOException + if the security handler cannot be found (TIKA-4082). + + * Fix bug that led to duplicate extraction of macros from some OLE2 containers (TIKA-4116). + + * Parse iframe's srcdoc as an embedded file (TIKA-3109). + + * Add detection of warc.gz as a specialization of gz and parse as if a standard WARC (TIKA-4048). + + * Allow users to modify the attachment limit size in the /unpack resource (TIKA-4039) + + * Fixed write limit bug in RecursiveParserWrapper (TIKA-4055). + + * Add mime detection for many files with thanks to Gregory Lepore (TIKA-3992). + + * Fixed iWork 13 keynote detection on files with wrong extension (TIKA-4111). + +Release 2.8.0 - 5/11/2023 + + * Enable counting and/or parsing of incremental updates in PDFs. This + is an experimental feature and may change in later releases (TIKA-4017). 
+ + * Fixed bug that prevented the loading of CompositeExternalParser in tika-app and + tika-server-standard. This parser will call exiftool and ffmpeg if those are installed, as was + the behavior in Tika 1.x. Exclude org.apache.tika.parser.external.CompositeExternalParser + if you do not want this behavior (TIKA-4022). + + * Removed the shading of tika-parsers-standard-module (TIKA-4038). + + * Enable optional extraction of file system metadata in FileSystemFetcher (TIKA-4035). + + * Allow pretty printing in FileSystemEmitter (TIKA-4034). + + * Add detection for and a new mime type for older postscript-based + Adobe Illustrator "application/illustrator+ps" files (TIKA-3971). + + * Add magic detection for canon raw file types: crw, cr2 and cr3 (TIKA-3991). + + * Add detection for ONIX message files (TIKA-4011). + + * Add detection and a parser for ActiveMime files (TIKA-3987). + + * Add extraction of rendition layout value and version from Epub (TIKA-4013). + + * Improve embedded file extraction from PDFs (TIKA-4012). + + * Improve metadata extraction from WARCs (TIKA-4018). + + * Update to PDFBox 2.0.28 (TIKA-4016). + + * Users may now avoid the ZeroByteFileException via a + setting on the AutoDetectParserConfig (TIKA-3976). + + * Fix bug in closing elements in the presence of elements + in RTF files (TIKA-3972). + + * Improve extraction of embedded file names in .docx (TIKA-3968). + + * Normalize author, title, subject and description to their Dublin Core + properties in the HTMLParser (TIKA-3963). + + +Release 2.7.0 - 1/31/2023 + + * Add SVG detection for svg files that lack the xml header (TIKA-3308). + + * Migrate to a live fork of Universal Charset Detector (TIKA-3213). + + * Improve handling of text-based attachments inside .eml files (TIKA-3959). + + * Add tika-parser-nlp-package to release artifacts (TIKA-3958). + + * Remove need for element in classes that extend ConfigBase (TIKA-3946). 
+ + * Add X-TIKA:embedded_id_path to ensure unique embedded file paths (TIKA-3942). + + * Fix bug that prevented digests when the fallback/EmptyParser + was called (TIKA-3939). + + * Remove log4j 1.2.x (and slf4j-log4j12 which now redirects to slf4j-reload4j) from + all modules (TIKA-3935). + + * Upgrade mime4j to 0.8.9 (TIKA-3950). + + * Refactor date parsing for emails (TIKA-3957) + + * Upgrade to Bouncy Castle 1.71 and jdk18on jars (TIKA-3933). + + * Add a JDBCPipesReporter (TIKA-3931). + + * Add multivalued field strategy option in jdbc-emitter (TIKA-3930). + Default is now 'concatenate' with ', ' as the delimiter. + + * Downgrade logging in PipesClient for each parse from info to debug. + +Release 2.6.0 - 11/3/2022 + + * Add optional Siegfried detector (TIKA-3901). + + * Move
svn commit: r68265 [1/3] - in /release/tika: 2.9.1/ 2.9.2/
Author: tallison Date: Tue Apr 2 17:55:52 2024 New Revision: 68265 Log: update for 2.9.2 release Added: release/tika/2.9.2/ release/tika/2.9.2/CHANGES.txt release/tika/2.9.2/tika-2.9.2-src.zip (with props) release/tika/2.9.2/tika-2.9.2-src.zip.asc release/tika/2.9.2/tika-2.9.2-src.zip.sha512 release/tika/2.9.2/tika-app-2.9.2.jar (with props) release/tika/2.9.2/tika-app-2.9.2.jar.asc release/tika/2.9.2/tika-app-2.9.2.jar.sha512 release/tika/2.9.2/tika-eval-app-2.9.2.jar (with props) release/tika/2.9.2/tika-eval-app-2.9.2.jar.asc release/tika/2.9.2/tika-eval-app-2.9.2.jar.sha512 release/tika/2.9.2/tika-parser-nlp-package-2.9.2.jar (with props) release/tika/2.9.2/tika-parser-nlp-package-2.9.2.jar.asc release/tika/2.9.2/tika-parser-nlp-package-2.9.2.jar.sha512 release/tika/2.9.2/tika-parser-scientific-package-2.9.2.jar (with props) release/tika/2.9.2/tika-parser-scientific-package-2.9.2.jar.asc release/tika/2.9.2/tika-parser-scientific-package-2.9.2.jar.sha512 release/tika/2.9.2/tika-parser-sqlite3-package-2.9.2.jar (with props) release/tika/2.9.2/tika-parser-sqlite3-package-2.9.2.jar.asc release/tika/2.9.2/tika-parser-sqlite3-package-2.9.2.jar.sha512 release/tika/2.9.2/tika-server-standard-2.9.2-bin.tgz (with props) release/tika/2.9.2/tika-server-standard-2.9.2-bin.tgz.asc release/tika/2.9.2/tika-server-standard-2.9.2-bin.tgz.sha512 release/tika/2.9.2/tika-server-standard-2.9.2-bin.zip (with props) release/tika/2.9.2/tika-server-standard-2.9.2-bin.zip.asc release/tika/2.9.2/tika-server-standard-2.9.2-bin.zip.sha512 release/tika/2.9.2/tika-server-standard-2.9.2.jar (with props) release/tika/2.9.2/tika-server-standard-2.9.2.jar.asc release/tika/2.9.2/tika-server-standard-2.9.2.jar.sha512 Removed: release/tika/2.9.1/
(tika) branch TIKA-4207 deleted (was 4d22b3200)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git was 4d22b3200 TIKA-4207 -- remove local paths in unit tests and update RUnpackExtractor The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(tika) branch TIKA-4207 updated (a8e25cd1e -> 4d22b3200)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git from a8e25cd1e Merge remote-tracking branch 'origin/main' into TIKA-4207 add 4d22b3200 TIKA-4207 -- remove local paths in unit tests and update RUnpackExtractor No new revisions were added by this update. Summary of changes: .../ParsingEmbeddedDocumentExtractor.java | 8 +++- .../apache/tika/extractor/RUnpackExtractor.java| 52 +++--- .../org/apache/tika/pipes/PipesServerTest.java | 7 +-- 3 files changed, 13 insertions(+), 54 deletions(-)
(tika) 01/01: Merge remote-tracking branch 'origin/main' into TIKA-4207
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git commit a8e25cd1ed82aecccb12ecbe1fe5d74690d311e9 Merge: 8cdaff4b3 b1f8e430f Author: tallison AuthorDate: Thu Mar 28 07:12:12 2024 -0400 Merge remote-tracking branch 'origin/main' into TIKA-4207 .../org/apache/tika/mime/tika-mimetypes.xml| 34 +++- .../java/org/apache/tika/TikaDetectionTest.java| 2 +- tika-parent/pom.xml| 18 +- .../detect/microsoft/ooxml/OPCPackageDetector.java | 47 +++-- .../apache/tika/parser/epub/EncryptionParser.java | 88 -- .../org/apache/tika/parser/epub/EpubParser.java| 193 - .../org/apache/tika/parser/pdf/XFAExtractor.java | 3 + .../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +- .../tika/detect/TestContainerAwareDetector.java| 5 + .../java/org/apache/tika/mime/TestMimeTypes.java | 6 + .../src/test/resources/test-documents/test3mf.3mf | Bin 0 -> 28243 bytes .../resources/test-documents/testSTL-ascii.stl | 16 ++ .../resources/test-documents/testSTL-binary.stl| Bin 0 -> 160 bytes 13 files changed, 255 insertions(+), 159 deletions(-)
(tika) branch TIKA-4207 updated (8cdaff4b3 -> a8e25cd1e)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git from 8cdaff4b3 TIKA-4207 -- further refactorings to simplify class structure and bring back the default ParsingEmbeddedDocumentExtractor add fd23e6c27 Bump io.netty:netty-bom from 4.1.107.Final to 4.1.108.Final add d600259c5 Merge pull request #1677 from apache/dependabot/maven/io.netty-netty-bom-4.1.108.Final add 8e27e31a6 Bump com.google.cloud:google-cloud-storage from 2.36.0 to 2.36.1 add a954511bd Merge pull request #1676 from apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.36.1 add a01e3edb4 Bump aws.version from 1.12.684 to 1.12.685 add daad9b2b1 Merge pull request #1675 from apache/dependabot/maven/aws.version-1.12.685 add 33ac40ccf TIKA-4166: update azure-storage-blob add f3f8404dd Bump commons-logging:commons-logging from 1.3.0 to 1.3.1 add 449f8d192 Merge pull request #1683 from apache/dependabot/maven/commons-logging-commons-logging-1.3.1 add fce53f9df Bump aws.version from 1.12.685 to 1.12.686 add 27f1d87e5 Merge pull request #1682 from apache/dependabot/maven/aws.version-1.12.686 add ba51ff3b6 Bump de.thetaphi:forbiddenapis from 3.6 to 3.7 add 39b5c8a7b Merge pull request #1681 from apache/dependabot/maven/de.thetaphi-forbiddenapis-3.7 add c51ab337d Bump org.ow2.asm:asm from 9.6 to 9.7 add 40bf35574 Merge pull request #1680 from apache/dependabot/maven/org.ow2.asm-asm-9.7 add b9ab4813e TIKA-4171 -- fix regression when field names are missing in the XFAExtractor (#1679) add a559906db TIKA-4219 -- improve epub handling of encrypted non-text-containing items (#1684) add 36e3ba8cd TIKA-4225 -- add detection for amf (#1688) add 3ffbc04f7 TIKA-4224 -- add detection for 3mf (#1689) add c5693624c TIKA-4222 -- add openscad glob (#1690) add b6bfe78d9 Bump aws.version from 1.12.686 to 1.12.687 add 035c18461 Merge pull request #1692 from 
apache/dependabot/maven/aws.version-1.12.687 add 9d45b69da TIKA-4223 -- add detection of stl (#1691) add e88be05ad TIKA-4219 -- clean up...do not include font names in main package add afc05ee4b Bump com.fasterxml.woodstox:woodstox-core from 6.6.1 to 6.6.2 add e5511a043 Merge pull request #1693 from apache/dependabot/maven/com.fasterxml.woodstox-woodstox-core-6.6.2 add 25badd98b Bump aws.version from 1.12.687 to 1.12.688 add 07f1f4f24 Merge pull request #1694 from apache/dependabot/maven/aws.version-1.12.688 add 1fb5b2622 Bump aws.version from 1.12.688 to 1.12.689 add 4f5dff9a1 Merge pull request #1696 from apache/dependabot/maven/aws.version-1.12.689 add f8c6750c9 Bump com.github.luben:zstd-jni from 1.5.5-11 to 1.5.6-1 add b1f8e430f Merge pull request #1697 from apache/dependabot/maven/com.github.luben-zstd-jni-1.5.6-1 new a8e25cd1e Merge remote-tracking branch 'origin/main' into TIKA-4207 The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. 
Summary of changes: .../org/apache/tika/mime/tika-mimetypes.xml| 34 +++- .../java/org/apache/tika/TikaDetectionTest.java| 2 +- tika-parent/pom.xml| 18 +- .../detect/microsoft/ooxml/OPCPackageDetector.java | 47 +++-- .../apache/tika/parser/epub/EncryptionParser.java | 88 -- .../org/apache/tika/parser/epub/EpubParser.java| 193 - .../org/apache/tika/parser/pdf/XFAExtractor.java | 3 + .../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +- .../tika/detect/TestContainerAwareDetector.java| 5 + .../java/org/apache/tika/mime/TestMimeTypes.java | 6 + .../src/test/resources/test-documents/test3mf.3mf | Bin 0 -> 28243 bytes .../resources/test-documents/testSTL-ascii.stl | 16 ++ .../resources/test-documents/testSTL-binary.stl| Bin 0 -> 160 bytes 13 files changed, 255 insertions(+), 159 deletions(-) delete mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/test3mf.3mf create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testSTL-ascii.stl create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testSTL-binary.stl
(tika) branch TIKA-4207 updated: TIKA-4207 -- further refactorings to simplify class structure and bring back the default ParsingEmbeddedDocumentExtractor
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/TIKA-4207 by this push: new 8cdaff4b3 TIKA-4207 -- further refactorings to simplify class structure and bring back the default ParsingEmbeddedDocumentExtractor 8cdaff4b3 is described below commit 8cdaff4b3e2a4a477f753f3bfca751d804721a9d Author: tallison AuthorDate: Thu Mar 28 07:11:46 2024 -0400 TIKA-4207 -- further refactorings to simplify class structure and bring back the default ParsingEmbeddedDocumentExtractor --- ...a => AbstractEmbeddedDocumentBytesHandler.java} | 2 +- ...java => BasicEmbeddedDocumentBytesHandler.java} | 12 ++- ...EmbeddedDocumentByteStoreExtractorFactory.java} | 24 +++--- ...tore.java => EmbeddedDocumentBytesHandler.java} | 4 +- .../tika/extractor/EmbeddedDocumentUtil.java | 2 +- .../ParsingEmbeddedDocumentExtractor.java | 93 +- .../ParsingEmbeddedDocumentExtractorFactory.java | 74 + ...ocumentExtractor.java => RUnpackExtractor.java} | 19 +++-- ...orFactory.java => RUnpackExtractorFactory.java} | 11 ++- .../org/apache/tika/parser/AutoDetectParser.java | 11 ++- .../apache/tika/parser/AutoDetectParserConfig.java | 4 +- .../java/org/apache/tika/pipes/PipesServer.java| 67 +++- .../extractor/EmbeddedDocumentBytesConfig.java | 9 +++ ...a => EmittingEmbeddedDocumentBytesHandler.java} | 15 ++-- .../tika/parser/AutoDetectParserConfigTest.java| 10 +-- .../org/apache/tika/pipes/PipesServerTest.java | 17 +++- .../config/TIKA-4207-embedded-bytes-config.xml | 2 +- .../apache/tika/pipes/TIKA-4207-limit-bytes.xml| 2 +- .../apache/tika/example/ExtractEmbeddedFiles.java | 2 +- .../parser/microsoft/pst/OutlookPSTParserTest.java | 2 +- .../apache/tika/parser/pdf/PDFRenderingTest.java | 2 +- .../resources/configs/tika-config-no-names.xml | 2 +- .../resources/configs/tika-config-with-names.xml | 2 +- 23 files changed, 142 insertions(+), 246 
deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java similarity index 96% rename from tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java rename to tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java index 15b26451a..3f2f38f94 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java @@ -28,7 +28,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.utils.StringUtils; -public abstract class AbstractEmbeddedDocumentByteStore implements EmbeddedDocumentByteStore { +public abstract class AbstractEmbeddedDocumentBytesHandler implements EmbeddedDocumentBytesHandler { List ids = new ArrayList<>(); diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java similarity index 80% rename from tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java rename to tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java index d3aeb4507..cf6441b4f 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java @@ -27,9 +27,16 @@ import org.apache.commons.io.input.UnsynchronizedBufferedInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; -public class BasicEmbeddedDocumentByteStore extends AbstractEmbeddedDocumentByteStore { +/** + * For 
now, this is an in-memory EmbeddedDocumentBytesHandler that stores + * all the bytes in memory. Users can retrieve the documents with {@link #getDocument(int)}. + * + * We'll need to make this cache to disk at some point if there are many bytes of + * embedded documents. + */ +public class BasicEmbeddedDocumentBytesHandler extends AbstractEmbeddedDocumentBytesHandler { private final EmbeddedDocumentBytesConfig config; -public BasicEmbeddedDocumentByteStore(EmbeddedDocumentBytesConfig config) { +public BasicEmbeddedDocumentBytesHandler(EmbeddedDocumentBytesConfig config) { this.config = config; } //this won't scale, but let's start fully in memory for now; @@ -40,7 +47,6 @@ public class BasicEmbeddedDocumentByteStore extends Abst
svn commit: r68147 [3/3] - /dev/tika/2.9.2/
Added: dev/tika/2.9.2/tika-2.9.2-src.zip == Binary file - no diff available. Propchange: dev/tika/2.9.2/tika-2.9.2-src.zip -- svn:mime-type = application/octet-stream Added: dev/tika/2.9.2/tika-2.9.2-src.zip.asc == --- dev/tika/2.9.2/tika-2.9.2-src.zip.asc (added) +++ dev/tika/2.9.2/tika-2.9.2-src.zip.asc Tue Mar 26 15:42:23 2024 @@ -0,0 +1,16 @@ +-BEGIN PGP SIGNATURE- + +iQIzBAABCgAdFiEEGERU+thpd2Dz4A0uSlGkW5RP/VEFAmYC6p4ACgkQSlGkW5RP +/VHkew/8DP2ArFLkVHnWXQWCj5lIaN6j0Mhikcg6JvRjGvN6mpNzsZv9aYUBuGEz +JjGZhyCBGxZVcJgQGobEMgl8HqNsnj6o+CHMNr/MbAje54BriAglQGMiiu4nGEB5 +ifGQzvatEjgwtLJEKnOd+xQFF5Uq0mbm29qp48fKz2fkNdeWma0YzE+A6ZQcdzfx +Kh9D0RxQgdjDz5Sf1z2cTTjwUOj38XnPq5Ak1TIXoatYCohPOGvMyJPgthKVvhJ+ +8IcqbVy7ER9eCrQOfeaS2CvAjnKSeFqw6nO7pG/T1F+9X164XDJuP8c1MxRIj95y +qbo1U8sZ3THvSl6oXXrE8BNIiHsnEiujaDZfw+LsE6Y/XfLnNgOS7FrttKXGLM5e +wRzy9M39WcJHv6gsbWTnF7pEozPtT2eAp3CbIdedJzpDVaL0UKMw9Dpm3A5AiqUg +7Tq4Z/Ecdg2pU5TsNaflo2/oOF7jBFKRh3tuiCesQeNqhlxzpXVKWwmlP59eReGH +pO7Z0IhNGTJv2aPhACkdFCogfIAI+DiPRaN71SKH1mYsJ/eeyEgnupL/cChTw9Zc +ZtHEWWWUxLr6BLXD7K6jCEhrLi8Ca6ozsIrgllkyaoo8h85319zA/mlbO52ez2Gf +ZDDvGiRSX1FDrNvtANhJry6HIhRryXja21gWJT9CgCp4AD1HIAI= +=cW6O +-END PGP SIGNATURE- Added: dev/tika/2.9.2/tika-2.9.2-src.zip.sha512 == --- dev/tika/2.9.2/tika-2.9.2-src.zip.sha512 (added) +++ dev/tika/2.9.2/tika-2.9.2-src.zip.sha512 Tue Mar 26 15:42:23 2024 @@ -0,0 +1 @@ +5ac7b981aa89d44e177dfb457d6f6b73dd54d43641da31e76b3e8bd9dbc236b9d2e6f6958d9182f36cbee6409293f3f21421f9c89837f693f5e10f997e9b063c Added: dev/tika/2.9.2/tika-app-2.9.2.jar == Binary file - no diff available. 
Propchange: dev/tika/2.9.2/tika-app-2.9.2.jar -- svn:mime-type = application/octet-stream Added: dev/tika/2.9.2/tika-app-2.9.2.jar.asc == --- dev/tika/2.9.2/tika-app-2.9.2.jar.asc (added) +++ dev/tika/2.9.2/tika-app-2.9.2.jar.asc Tue Mar 26 15:42:23 2024 @@ -0,0 +1,16 @@ +-BEGIN PGP SIGNATURE- + +iQIzBAABCgAdFiEEGERU+thpd2Dz4A0uSlGkW5RP/VEFAmYC6Q4ACgkQSlGkW5RP +/VHUoBAAmnCXYNlpVzqlUGnxqn0WqJ8W0O4gpEDvUztYnHfcFRcDmPf0ZxjziYkS +WURF9J+i/ezK9Mx4j4SLcmGNwPAA3I0z37GCda4bpPZpUsLLBs9/q2kAC+bu2sD5 +8fEJm6InO0Bb1uRN1S0ONAHfbZyWbt0E2pqD60On7KoQW79bdODhaUPxlrWczoMc +vJrzoLGlGOX3zG2HtlZkRE8UG2TNP8J8SgLL26sivUiR97cnZ1U9Yi4/eYz2IZfw +uLVURm7x34PXqNPZ92De8HUJfesiGwQ73WUlRCH6lkNm8HqhhDSayJgEci0Z5suC +ocAiNXzLUmvJDDexREyM7DkVjNJyNJJ8RijZiI9lle+hssWeyaD5Wapg+4cck02Q +wK9k7SQSVu8HvqEqVBMZ+xfK21IUb9wvy04KP0luK/pPdmKR3WG23hIRSrAs0rrc +7qRBwbV5nxNorlQ7maEXoJSD/A9ie/wkpdAAckFVEl6zK1i4tZsIo4V1knMDPR8f +f+GVrwrKq2QRALLRf3AI0Dh+wA4NdDCJ6ADbQ7IIEdipirz1cMTteL3kMlAuQiiY +hhW0VNBWEyWDCGvQ2DBLjXBOC8+an90yMeRwmsk4bKnEna0Gl3nzbHYKYlJ9D2f9 +TeOHCzumByaUNHWLNe7fb5u2w56hOm8MXgQQ2HFTWP0BeYgRof0= +=ckSS +-END PGP SIGNATURE- Added: dev/tika/2.9.2/tika-app-2.9.2.jar.sha512 == --- dev/tika/2.9.2/tika-app-2.9.2.jar.sha512 (added) +++ dev/tika/2.9.2/tika-app-2.9.2.jar.sha512 Tue Mar 26 15:42:23 2024 @@ -0,0 +1 @@ +55fe5fd70fefc2d0ca30abac024d5c659dc62a437741b52276fc7cd90be31f133a0a6e528863e81f642eab3e0c39690e63abb52f580e4261d4ac5bfd52934e25 Added: dev/tika/2.9.2/tika-eval-app-2.9.2.jar == Binary file - no diff available. 
Propchange: dev/tika/2.9.2/tika-eval-app-2.9.2.jar -- svn:mime-type = application/octet-stream Added: dev/tika/2.9.2/tika-eval-app-2.9.2.jar.asc == --- dev/tika/2.9.2/tika-eval-app-2.9.2.jar.asc (added) +++ dev/tika/2.9.2/tika-eval-app-2.9.2.jar.asc Tue Mar 26 15:42:23 2024 @@ -0,0 +1,16 @@ +-BEGIN PGP SIGNATURE- + +iQIzBAABCgAdFiEEGERU+thpd2Dz4A0uSlGkW5RP/VEFAmYC6mUACgkQSlGkW5RP +/VHgaA//bUhBGGQRQ4cTJFAFienrAD1mCqpgsHYLd5s/pshOST8M9NyS590FXp69 +NGHxLHpU0PjLbdZEo9lRtM5sNGK79LxVlMcx0oOlCC4ZHM00RJ8bexp1WLHmBt40 +pW2DF0cgT4TBTCLsZbBlpLgOjUeYoadUJSlZwBtgUsScsWtzSZERd8JQoPcXiExM +ncttxP5bxrAUu//d+dV0yyRl04RcvVLL8yU/LFhG1V5JL+2JJd8lFICmzYEJY3rb ++nwsLoGUX9y2Pq1ELK8VFIhnSMPB6NIWYpu2I0Lm0pgs1A9/VtNmo61FUWWgWbA5 +FSlq8PjttJ0oG4LwR80yRhVhAF9lI3I6xUE0vMDPmOIBj75Q8s+SUrQLTWDCX0DZ +aTuynitouof5HFdYK1mOq8Ue9vmkYgr/9UHPzzOY8rVhp+8Z+MV6vLc3hDXo7UTS +sHNHNp0bPLh53ICsdnVrElMd4AtSFe/92aHpMN+JWybdfvLiLGIi57GWswDYeUwc +x/bcQ6CtfGyc/dvLs2xeWyPYm6U8AcQm5Rx9LA/KXRHZPTkZ3RhqJzy6f8udLeFB
svn commit: r68147 [2/3] - /dev/tika/2.9.2/
Added: dev/tika/2.9.2/CHANGES.txt == --- dev/tika/2.9.2/CHANGES.txt (added) +++ dev/tika/2.9.2/CHANGES.txt Tue Mar 26 15:42:23 2024 @@ -0,0 +1,3205 @@ +Release 2.9.2 - 3/26/2024 + + * Dependency upgrades including temporary workarounds for regressions in commons-compress. + + * Add detection for OpenSCAD, 3MF, AMF, STL file formats via Robin Schimpf (TIKA-4222, TIKA-4223, + TIKA-4224, TIKA-4225). + +Release 2.9.1 - 10/17/2023 + + * Dependency upgrades including commons-compress to fix CVE-2023-42503. + + * Improve RFC822 detection (TIKA-4153). + + * Enable configuration of "maxJsonStringFieldLength" in TikaConfig to allow users to + avoid DEFAULT_MAX_STRING_LEN exceptions from Jackson (TIKA-4154). + + * Fix bug in DateUtils that stripped timezone information from + incoming Calendar objects (TIKA-4126). + + * The InputStreamDigester now calculates stream length (TIKA-4016). + +Release 2.9.0 - 8/23/2023 + + * With user configuration, the PDFParser can now throw an EncryptedDocumentException + for Microsoft IRM PDF containers with encrypted payloads. Separately, + the PDFParser now throws an EncryptedDocumentException instead of an IOException + if the security handler cannot be found (TIKA-4082). + + * Fix bug that led to duplicate extraction of macros from some OLE2 containers (TIKA-4116). + + * Parse iframe's srcdoc as an embedded file (TIKA-3109). + + * Add detection of warc.gz as a specialization of gz and parse as if a standard WARC (TIKA-4048). + + * Allow users to modify the attachment limit size in the /unpack resource (TIKA-4039) + + * Fixed write limit bug in RecursiveParserWrapper (TIKA-4055). + + * Add mime detection for many files with thanks to Gregory Lepore (TIKA-3992). + + * Fixed iWork 13 keynote detection on files with wrong extension (TIKA-4111). + +Release 2.8.0 - 5/11/2023 + + * Enable counting and/or parsing of incremental updates in PDFs. This + is an experimental feature and may change in later releases (TIKA-4017). 
+ + * Fixed bug that prevented the loading of CompositeExternalParser in tika-app and + tika-server-standard. This parser will call exiftool and ffmpeg if those are installed, as was + the behavior in Tika 1.x. Exclude org.apache.tika.parser.external.CompositeExternalParser + if you do not want this behavior (TIKA-4022). + + * Removed the shading of tika-parsers-standard-module (TIKA-4038). + + * Enable optional extraction of file system metadata in FileSystemFetcher (TIKA-4035). + + * Allow pretty printing in FileSystemEmitter (TIKA-4034). + + * Add detection for and a new mime type for older postscript-based + Adobe Illustrator "application/illustrator+ps" files (TIKA-3971). + + * Add magic detection for canon raw file types: crw, cr2 and cr3 (TIKA-3991). + + * Add detection for ONIX message files (TIKA-4011). + + * Add detection and a parser for ActiveMime files (TIKA-3987). + + * Add extraction of rendition layout value and version from Epub (TIKA-4013). + + * Improve embedded file extraction from PDFs (TIKA-4012). + + * Improve metadata extraction from WARCs (TIKA-4018). + + * Update to PDFBox 2.0.28 (TIKA-4016). + + * Users may now avoid the ZeroByteFileException via a + setting on the AutoDetectParserConfig (TIKA-3976). + + * Fix bug in closing elements in the presence of elements + in RTF files (TIKA-3972). + + * Improve extraction of embedded file names in .docx (TIKA-3968). + + * Normalize author, title, subject and description to their Dublin Core + properties in the HTMLParser (TIKA-3963). + + +Release 2.7.0 - 1/31/2023 + + * Add SVG detection for svg files that lack the xml header (TIKA-3308). + + * Migrate to a live fork of Universal Charset Detector (TIKA-3213). + + * Improve handling of text-based attachments inside .eml files (TIKA-3959). + + * Add tika-parser-nlp-package to release artifacts (TIKA-3958). + + * Remove need for element in classes that extend ConfigBase (TIKA-3946). 
+ + * Add X-TIKA:embedded_id_path to ensure unique embedded file paths (TIKA-3942). + + * Fix bug that prevented digests when the fallback/EmptyParser + was called (TIKA-3939). + + * Remove log4j 1.2.x (and slf4j-log4j12 which now redirects to slf4j-reload4j) from + all modules (TIKA-3935). + + * Upgrade mime4j to 0.8.9 (TIKA-3950). + + * Refactor date parsing for emails (TIKA-3957) + + * Upgrade to Bouncy Castle 1.71 and jdk18on jars (TIKA-3933). + + * Add a JDBCPipesReporter (TIKA-3931). + + * Add multivalued field strategy option in jdbc-emitter (TIKA-3930). + Default is now 'concatenate' with ', ' as the delimiter. + + * Downgrade logging in PipesClient for each parse from info to debug. + +Release 2.6.0 - 11/3/2022 + + * Add optional Siegfried detector (TIKA-3901). + + * Move OverrideDetector's
svn commit: r68147 [1/3] - /dev/tika/2.9.2/
Author: tallison Date: Tue Mar 26 15:42:23 2024 New Revision: 68147 Log: 2.9.2 rc2 Added: dev/tika/2.9.2/CHANGES.txt dev/tika/2.9.2/tika-2.9.2-src.zip (with props) dev/tika/2.9.2/tika-2.9.2-src.zip.asc dev/tika/2.9.2/tika-2.9.2-src.zip.sha512 dev/tika/2.9.2/tika-app-2.9.2.jar (with props) dev/tika/2.9.2/tika-app-2.9.2.jar.asc dev/tika/2.9.2/tika-app-2.9.2.jar.sha512 dev/tika/2.9.2/tika-eval-app-2.9.2.jar (with props) dev/tika/2.9.2/tika-eval-app-2.9.2.jar.asc dev/tika/2.9.2/tika-eval-app-2.9.2.jar.sha512 dev/tika/2.9.2/tika-parser-nlp-package-2.9.2.jar (with props) dev/tika/2.9.2/tika-parser-nlp-package-2.9.2.jar.asc dev/tika/2.9.2/tika-parser-nlp-package-2.9.2.jar.sha512 dev/tika/2.9.2/tika-parser-scientific-package-2.9.2.jar (with props) dev/tika/2.9.2/tika-parser-scientific-package-2.9.2.jar.asc dev/tika/2.9.2/tika-parser-scientific-package-2.9.2.jar.sha512 dev/tika/2.9.2/tika-parser-sqlite3-package-2.9.2.jar (with props) dev/tika/2.9.2/tika-parser-sqlite3-package-2.9.2.jar.asc dev/tika/2.9.2/tika-parser-sqlite3-package-2.9.2.jar.sha512 dev/tika/2.9.2/tika-server-standard-2.9.2-bin.tgz (with props) dev/tika/2.9.2/tika-server-standard-2.9.2-bin.tgz.asc dev/tika/2.9.2/tika-server-standard-2.9.2-bin.tgz.sha512 dev/tika/2.9.2/tika-server-standard-2.9.2-bin.zip (with props) dev/tika/2.9.2/tika-server-standard-2.9.2-bin.zip.asc dev/tika/2.9.2/tika-server-standard-2.9.2-bin.zip.sha512 dev/tika/2.9.2/tika-server-standard-2.9.2.jar (with props) dev/tika/2.9.2/tika-server-standard-2.9.2.jar.asc dev/tika/2.9.2/tika-server-standard-2.9.2.jar.sha512 Removed: dev/tika/2.9.2/CHANGES-2.9.2.txt
(tika) branch branch_2x updated: [maven-release-plugin] prepare for next development iteration
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/branch_2x by this push: new e9f7252d9 [maven-release-plugin] prepare for next development iteration e9f7252d9 is described below commit e9f7252d96a8b9e12ef59912a161b4afe5b334ec Author: tallison AuthorDate: Tue Mar 26 11:08:41 2024 -0400 [maven-release-plugin] prepare for next development iteration --- pom.xml| 4 +- tika-app/pom.xml | 4 +- tika-batch/pom.xml | 4 +- tika-bom/pom.xml | 146 ++--- tika-bundles/pom.xml | 6 +- tika-bundles/tika-bundle-standard/pom.xml | 6 +- tika-core/pom.xml | 4 +- tika-detectors/pom.xml | 2 +- tika-detectors/tika-detector-siegfried/pom.xml | 2 +- tika-eval/pom.xml | 4 +- tika-eval/tika-eval-app/pom.xml| 4 +- tika-eval/tika-eval-core/pom.xml | 4 +- tika-example/pom.xml | 4 +- tika-fuzzing/pom.xml | 2 +- tika-integration-tests/pom.xml | 4 +- .../tika-pipes-kafka-integration-tests/pom.xml | 4 +- .../pom.xml| 4 +- .../tika-pipes-s3-integration-tests/pom.xml| 4 +- .../tika-pipes-solr-integration-tests/pom.xml | 4 +- .../tika-resource-loading-tests/pom.xml| 2 +- tika-java7/pom.xml | 4 +- tika-langdetect/pom.xml| 4 +- tika-langdetect/tika-langdetect-lingo24/pom.xml| 4 +- tika-langdetect/tika-langdetect-mitll-text/pom.xml | 4 +- tika-langdetect/tika-langdetect-opennlp/pom.xml| 4 +- tika-langdetect/tika-langdetect-optimaize/pom.xml | 4 +- .../tika-langdetect-test-commons/pom.xml | 4 +- tika-langdetect/tika-langdetect-tika/pom.xml | 4 +- tika-parent/pom.xml| 6 +- tika-parsers/pom.xml | 4 +- tika-parsers/tika-parsers-extended/pom.xml | 4 +- .../tika-parser-scientific-module/pom.xml | 4 +- .../tika-parser-scientific-package/pom.xml | 4 +- .../tika-parser-sqlite3-module/pom.xml | 4 +- .../tika-parser-sqlite3-package/pom.xml| 4 +- .../pom.xml| 4 +- tika-parsers/tika-parsers-ml/pom.xml | 4 +- 
.../tika-parsers-ml/tika-age-recogniser/pom.xml| 4 +- tika-parsers/tika-parsers-ml/tika-dl/pom.xml | 4 +- .../tika-parser-advancedmedia-module/pom.xml | 4 +- .../tika-parser-advancedmedia-package/pom.xml | 4 +- .../tika-parsers-ml/tika-parser-nlp-module/pom.xml | 4 +- .../tika-parser-nlp-package/pom.xml| 4 +- .../tika-parsers-ml/tika-transcribe-aws/pom.xml| 4 +- tika-parsers/tika-parsers-standard/pom.xml | 4 +- .../tika-parsers-standard-modules/pom.xml | 4 +- .../tika-parser-apple-module/pom.xml | 4 +- .../tika-parser-audiovideo-module/pom.xml | 4 +- .../tika-parser-cad-module/pom.xml | 4 +- .../tika-parser-code-module/pom.xml| 4 +- .../tika-parser-crypto-module/pom.xml | 4 +- .../tika-parser-digest-commons/pom.xml | 4 +- .../tika-parser-font-module/pom.xml| 4 +- .../tika-parser-html-commons/pom.xml | 4 +- .../tika-parser-html-module/pom.xml| 4 +- .../tika-parser-image-module/pom.xml | 4 +- .../tika-parser-jdbc-commons/pom.xml | 4 +- .../tika-parser-mail-commons/pom.xml | 4 +- .../tika-parser-mail-module/pom.xml| 4 +- .../tika-parser-microsoft-module/pom.xml | 4 +- .../tika-parser-miscoffice-module/pom.xml | 4 +- .../tika-parser-news-module/pom.xml| 4 +- .../tika-parser-ocr-module/pom.xml | 4 +- .../tika-parser-pdf-module/pom.xml | 4 +- .../tika-parser-pkg-module/pom.xml | 4 +- .../tika-parser-text-module/pom.xml| 4 +- .../tika-parser-webarchive-module/pom.xml | 4 +- .../tika-parser-xml-module/pom.xml | 4 +- .../tika-parser-xmp-commons/pom.xml| 4 +- .../tika-parser-zip-commons/pom.xml| 4 +- .../tika-parsers-standard-package/pom.xml | 4 +- tika-pipes/pom.xml | 4 +- tika-pipes/tika-async-cli/pom.xml
(tika) annotated tag 2.9.2-rc2 updated (1dbf284b7 -> 55a70c070)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to annotated tag 2.9.2-rc2 in repository https://gitbox.apache.org/repos/asf/tika.git *** WARNING: tag 2.9.2-rc2 was modified! *** from 1dbf284b7 (commit) to 55a70c070 (tag) tagging 1dbf284b7131b13f0ab35162ac5914e2aba7baa6 (commit) replaces 2.9.2-rc1 by tallison on Tue Mar 26 11:08:40 2024 -0400 - Log - [maven-release-plugin] copy for tag 2.9.2-rc2 --- No new revisions were added by this update. Summary of changes:
(tika) branch branch_2x updated (7a36751e0 -> 1dbf284b7)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a change to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git from 7a36751e0 revert version for rc2 add 1dbf284b7 [maven-release-plugin] prepare release 2.9.2-rc2 No new revisions were added by this update. Summary of changes: pom.xml| 4 +- tika-app/pom.xml | 4 +- tika-batch/pom.xml | 4 +- tika-bom/pom.xml | 146 ++--- tika-bundles/pom.xml | 6 +- tika-bundles/tika-bundle-standard/pom.xml | 6 +- tika-core/pom.xml | 4 +- tika-detectors/pom.xml | 2 +- tika-detectors/tika-detector-siegfried/pom.xml | 2 +- tika-eval/pom.xml | 4 +- tika-eval/tika-eval-app/pom.xml| 4 +- tika-eval/tika-eval-core/pom.xml | 4 +- tika-example/pom.xml | 4 +- tika-fuzzing/pom.xml | 2 +- tika-integration-tests/pom.xml | 4 +- .../tika-pipes-kafka-integration-tests/pom.xml | 4 +- .../pom.xml| 4 +- .../tika-pipes-s3-integration-tests/pom.xml| 4 +- .../tika-pipes-solr-integration-tests/pom.xml | 4 +- .../tika-resource-loading-tests/pom.xml| 2 +- tika-java7/pom.xml | 4 +- tika-langdetect/pom.xml| 4 +- tika-langdetect/tika-langdetect-lingo24/pom.xml| 4 +- tika-langdetect/tika-langdetect-mitll-text/pom.xml | 4 +- tika-langdetect/tika-langdetect-opennlp/pom.xml| 4 +- tika-langdetect/tika-langdetect-optimaize/pom.xml | 4 +- .../tika-langdetect-test-commons/pom.xml | 4 +- tika-langdetect/tika-langdetect-tika/pom.xml | 4 +- tika-parent/pom.xml| 6 +- tika-parsers/pom.xml | 4 +- tika-parsers/tika-parsers-extended/pom.xml | 4 +- .../tika-parser-scientific-module/pom.xml | 4 +- .../tika-parser-scientific-package/pom.xml | 4 +- .../tika-parser-sqlite3-module/pom.xml | 4 +- .../tika-parser-sqlite3-package/pom.xml| 4 +- .../pom.xml| 4 +- tika-parsers/tika-parsers-ml/pom.xml | 4 +- .../tika-parsers-ml/tika-age-recogniser/pom.xml| 4 +- tika-parsers/tika-parsers-ml/tika-dl/pom.xml | 4 +- .../tika-parser-advancedmedia-module/pom.xml | 4 +- .../tika-parser-advancedmedia-package/pom.xml | 4 +- 
.../tika-parsers-ml/tika-parser-nlp-module/pom.xml | 4 +- .../tika-parser-nlp-package/pom.xml| 4 +- .../tika-parsers-ml/tika-transcribe-aws/pom.xml| 4 +- tika-parsers/tika-parsers-standard/pom.xml | 4 +- .../tika-parsers-standard-modules/pom.xml | 4 +- .../tika-parser-apple-module/pom.xml | 4 +- .../tika-parser-audiovideo-module/pom.xml | 4 +- .../tika-parser-cad-module/pom.xml | 4 +- .../tika-parser-code-module/pom.xml| 4 +- .../tika-parser-crypto-module/pom.xml | 4 +- .../tika-parser-digest-commons/pom.xml | 4 +- .../tika-parser-font-module/pom.xml| 4 +- .../tika-parser-html-commons/pom.xml | 4 +- .../tika-parser-html-module/pom.xml| 4 +- .../tika-parser-image-module/pom.xml | 4 +- .../tika-parser-jdbc-commons/pom.xml | 4 +- .../tika-parser-mail-commons/pom.xml | 4 +- .../tika-parser-mail-module/pom.xml| 4 +- .../tika-parser-microsoft-module/pom.xml | 4 +- .../tika-parser-miscoffice-module/pom.xml | 4 +- .../tika-parser-news-module/pom.xml| 4 +- .../tika-parser-ocr-module/pom.xml | 4 +- .../tika-parser-pdf-module/pom.xml | 4 +- .../tika-parser-pkg-module/pom.xml | 4 +- .../tika-parser-text-module/pom.xml| 4 +- .../tika-parser-webarchive-module/pom.xml | 4 +- .../tika-parser-xml-module/pom.xml | 4 +- .../tika-parser-xmp-commons/pom.xml| 4 +- .../tika-parser-zip-commons/pom.xml| 4 +- .../tika-parsers-standard-package/pom.xml | 4 +- tika-pipes/pom.xml | 4 +- tika-pipes/tika-async-cli/pom.xml | 4 +- tika-pipes/tika-emitters/pom.xml | 4 +- .../tika-emitters/tika-emitter-az-blob/pom.xml | 4 +- tika-pipes/tika-emitters/tika-emitter-fs/pom.xml
(tika) branch branch_2x updated: revert version for rc2
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/branch_2x by this push: new 7a36751e0 revert version for rc2 7a36751e0 is described below commit 7a36751e0e15e3a1fa6bafc8f7f44993ebac7fb9 Author: tallison AuthorDate: Tue Mar 26 10:54:36 2024 -0400 revert version for rc2 --- pom.xml| 2 +- tika-app/pom.xml | 2 +- tika-batch/pom.xml | 2 +- tika-bom/pom.xml | 146 ++--- tika-bundles/pom.xml | 4 +- tika-bundles/tika-bundle-standard/pom.xml | 4 +- tika-core/pom.xml | 2 +- tika-detectors/pom.xml | 2 +- tika-detectors/tika-detector-siegfried/pom.xml | 2 +- tika-eval/pom.xml | 2 +- tika-eval/tika-eval-app/pom.xml| 2 +- tika-eval/tika-eval-core/pom.xml | 2 +- tika-example/pom.xml | 2 +- tika-fuzzing/pom.xml | 2 +- tika-integration-tests/pom.xml | 2 +- .../tika-pipes-kafka-integration-tests/pom.xml | 2 +- .../pom.xml| 2 +- .../tika-pipes-s3-integration-tests/pom.xml| 2 +- .../tika-pipes-solr-integration-tests/pom.xml | 2 +- .../tika-resource-loading-tests/pom.xml| 2 +- tika-java7/pom.xml | 2 +- tika-langdetect/pom.xml| 2 +- tika-langdetect/tika-langdetect-lingo24/pom.xml| 2 +- tika-langdetect/tika-langdetect-mitll-text/pom.xml | 2 +- tika-langdetect/tika-langdetect-opennlp/pom.xml| 2 +- tika-langdetect/tika-langdetect-optimaize/pom.xml | 2 +- .../tika-langdetect-test-commons/pom.xml | 2 +- tika-langdetect/tika-langdetect-tika/pom.xml | 2 +- tika-parent/pom.xml| 2 +- tika-parsers/pom.xml | 2 +- tika-parsers/tika-parsers-extended/pom.xml | 2 +- .../tika-parser-scientific-module/pom.xml | 2 +- .../tika-parser-scientific-package/pom.xml | 2 +- .../tika-parser-sqlite3-module/pom.xml | 2 +- .../tika-parser-sqlite3-package/pom.xml| 2 +- .../pom.xml| 2 +- tika-parsers/tika-parsers-ml/pom.xml | 2 +- .../tika-parsers-ml/tika-age-recogniser/pom.xml| 2 +- tika-parsers/tika-parsers-ml/tika-dl/pom.xml | 2 +- 
.../tika-parser-advancedmedia-module/pom.xml | 2 +- .../tika-parser-advancedmedia-package/pom.xml | 2 +- .../tika-parsers-ml/tika-parser-nlp-module/pom.xml | 2 +- .../tika-parser-nlp-package/pom.xml| 2 +- .../tika-parsers-ml/tika-transcribe-aws/pom.xml| 2 +- tika-parsers/tika-parsers-standard/pom.xml | 2 +- .../tika-parsers-standard-modules/pom.xml | 2 +- .../tika-parser-apple-module/pom.xml | 2 +- .../tika-parser-audiovideo-module/pom.xml | 2 +- .../tika-parser-cad-module/pom.xml | 2 +- .../tika-parser-code-module/pom.xml| 2 +- .../tika-parser-crypto-module/pom.xml | 2 +- .../tika-parser-digest-commons/pom.xml | 2 +- .../tika-parser-font-module/pom.xml| 2 +- .../tika-parser-html-commons/pom.xml | 2 +- .../tika-parser-html-module/pom.xml| 2 +- .../tika-parser-image-module/pom.xml | 2 +- .../tika-parser-jdbc-commons/pom.xml | 2 +- .../tika-parser-mail-commons/pom.xml | 2 +- .../tika-parser-mail-module/pom.xml| 2 +- .../tika-parser-microsoft-module/pom.xml | 2 +- .../tika-parser-miscoffice-module/pom.xml | 2 +- .../tika-parser-news-module/pom.xml| 2 +- .../tika-parser-ocr-module/pom.xml | 2 +- .../tika-parser-pdf-module/pom.xml | 2 +- .../tika-parser-pkg-module/pom.xml | 2 +- .../tika-parser-text-module/pom.xml| 2 +- .../tika-parser-webarchive-module/pom.xml | 2 +- .../tika-parser-xml-module/pom.xml | 2 +- .../tika-parser-xmp-commons/pom.xml| 2 +- .../tika-parser-zip-commons/pom.xml| 2 +- .../tika-parsers-standard-package/pom.xml | 2 +- tika-pipes/pom.xml | 2 +- tika-pipes/tika-async-cli/pom.xml | 2 +- tika-pipes/tika-emitters/pom.xml | 2
(tika) branch branch_2x updated: Revert writing of all file paths for embedded contents of epub (TIKA-4219)
This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git The following commit(s) were added to refs/heads/branch_2x by this push: new a501d0c10 Revert writing of all file paths for embedded contents of epub (TIKA-4219) a501d0c10 is described below commit a501d0c1050dd6a9d0f84df676b90b205ebd7ca7 Author: tallison AuthorDate: Tue Mar 26 10:49:52 2024 -0400 Revert writing of all file paths for embedded contents of epub (TIKA-4219) --- .../src/main/java/org/apache/tika/parser/epub/EpubParser.java | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java index b9f74cf3e..c066dd1f1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java @@ -452,13 +452,9 @@ public class EpubParser extends AbstractParser { xhtml.startElement("div", "class", "embedded"); try { -boolean outputHtml = true; -if (hRefMediaPair.media.contains("font") || hRefMediaPair.href.startsWith("fonts")) { -outputHtml = false; -} embeddedDocumentExtractor .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata, -outputHtml); +false); } finally { IOUtils.closeQuietly(stream);