This is an automated email from the ASF dual-hosted git repository.
davsclaus pushed a commit to branch camel-4.10.x
in repository https://gitbox.apache.org/repos/asf/camel.git
The following commit(s) were added to refs/heads/camel-4.10.x by this push:
new 582c22d4255e CAMEL-22757: camel-tika - Upgrade to tika 3.2.3 from 2.9.4 (#20246)
582c22d4255e is described below
commit 582c22d4255ebdf8d7c936495615ad11909e9062
Author: Claus Ibsen <[email protected]>
AuthorDate: Fri Dec 5 19:02:49 2025 +0100
CAMEL-22757: camel-tika - Upgrade to tika 3.2.3 from 2.9.4 (#20246)
---
.../resources/org/apache/camel/catalog/components/tika.json | 2 +-
components/camel-tika/pom.xml | 2 +-
.../resources/META-INF/org/apache/camel/component/tika/tika.json | 2 +-
.../java/org/apache/camel/component/tika/TikaParseOutputFormat.java | 3 ---
.../src/main/java/org/apache/camel/component/tika/TikaProducer.java | 4 ----
.../user-manual/modules/ROOT/pages/camel-4x-upgrade-guide-4_10.adoc | 6 ++++++
parent/pom.xml | 2 +-
7 files changed, 10 insertions(+), 11 deletions(-)
diff --git
a/catalog/camel-catalog/src/generated/resources/org/apache/camel/catalog/components/tika.json
b/catalog/camel-catalog/src/generated/resources/org/apache/camel/catalog/components/tika.json
index 862033d42ce4..c98e16835b2d 100644
---
a/catalog/camel-catalog/src/generated/resources/org/apache/camel/catalog/components/tika.json
+++
b/catalog/camel-catalog/src/generated/resources/org/apache/camel/catalog/components/tika.json
@@ -30,7 +30,7 @@
"properties": {
"operation": { "index": 0, "kind": "path", "displayName": "Operation",
"group": "producer", "label": "", "required": true, "type": "object",
"javaType": "org.apache.camel.component.tika.TikaOperation", "enum": [ "parse",
"detect" ], "deprecated": false, "deprecationNote": "", "autowired": false,
"secret": false, "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", "description": "Operation type" },
"tikaParseOutputEncoding": { "index": 1, "kind": "parameter",
"displayName": "Tika Parse Output Encoding", "group": "producer", "label": "",
"required": false, "type": "string", "javaType": "java.lang.String",
"deprecated": false, "autowired": false, "secret": false, "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", "description": "Tika Parse Output Encoding" },
- "tikaParseOutputFormat": { "index": 2, "kind": "parameter", "displayName":
"Tika Parse Output Format", "group": "producer", "label": "", "required":
false, "type": "object", "javaType":
"org.apache.camel.component.tika.TikaParseOutputFormat", "enum": [ "xml",
"html", "text", "textMain" ], "deprecated": false, "autowired": false,
"secret": false, "defaultValue": "xml", "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", [...]
+ "tikaParseOutputFormat": { "index": 2, "kind": "parameter", "displayName":
"Tika Parse Output Format", "group": "producer", "label": "", "required":
false, "type": "object", "javaType":
"org.apache.camel.component.tika.TikaParseOutputFormat", "enum": [ "xml",
"html", "text" ], "deprecated": false, "autowired": false, "secret": false,
"defaultValue": "xml", "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", "descriptio [...]
"lazyStartProducer": { "index": 3, "kind": "parameter", "displayName":
"Lazy Start Producer", "group": "producer (advanced)", "label":
"producer,advanced", "required": false, "type": "boolean", "javaType":
"boolean", "deprecated": false, "autowired": false, "secret": false,
"defaultValue": false, "description": "Whether the producer should be started
lazy (on the first message). By starting lazy you can use this to allow
CamelContext and routes to startup in situations where a produc [...]
"tikaConfig": { "index": 4, "kind": "parameter", "displayName": "Tika
Config", "group": "advanced", "label": "advanced", "required": false, "type":
"object", "javaType": "org.apache.tika.config.TikaConfig", "deprecated": false,
"autowired": false, "secret": false, "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", "description": "Tika Config" },
"tikaConfigUri": { "index": 5, "kind": "parameter", "displayName": "Tika
Config Uri", "group": "advanced", "label": "advanced", "required": false,
"type": "string", "javaType": "java.lang.String", "deprecated": false,
"autowired": false, "secret": false, "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", "description": "Tika Config Url" }
diff --git a/components/camel-tika/pom.xml b/components/camel-tika/pom.xml
index 2fb821b68eb1..823d94a987e9 100644
--- a/components/camel-tika/pom.xml
+++ b/components/camel-tika/pom.xml
@@ -48,7 +48,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-html-commons</artifactId>
+ <artifactId>tika-parser-html-module</artifactId>
<version>${tika-version}</version>
</dependency>
<dependency>
diff --git
a/components/camel-tika/src/generated/resources/META-INF/org/apache/camel/component/tika/tika.json
b/components/camel-tika/src/generated/resources/META-INF/org/apache/camel/component/tika/tika.json
index 862033d42ce4..c98e16835b2d 100644
---
a/components/camel-tika/src/generated/resources/META-INF/org/apache/camel/component/tika/tika.json
+++
b/components/camel-tika/src/generated/resources/META-INF/org/apache/camel/component/tika/tika.json
@@ -30,7 +30,7 @@
"properties": {
"operation": { "index": 0, "kind": "path", "displayName": "Operation",
"group": "producer", "label": "", "required": true, "type": "object",
"javaType": "org.apache.camel.component.tika.TikaOperation", "enum": [ "parse",
"detect" ], "deprecated": false, "deprecationNote": "", "autowired": false,
"secret": false, "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", "description": "Operation type" },
"tikaParseOutputEncoding": { "index": 1, "kind": "parameter",
"displayName": "Tika Parse Output Encoding", "group": "producer", "label": "",
"required": false, "type": "string", "javaType": "java.lang.String",
"deprecated": false, "autowired": false, "secret": false, "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", "description": "Tika Parse Output Encoding" },
- "tikaParseOutputFormat": { "index": 2, "kind": "parameter", "displayName":
"Tika Parse Output Format", "group": "producer", "label": "", "required":
false, "type": "object", "javaType":
"org.apache.camel.component.tika.TikaParseOutputFormat", "enum": [ "xml",
"html", "text", "textMain" ], "deprecated": false, "autowired": false,
"secret": false, "defaultValue": "xml", "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", [...]
+ "tikaParseOutputFormat": { "index": 2, "kind": "parameter", "displayName":
"Tika Parse Output Format", "group": "producer", "label": "", "required":
false, "type": "object", "javaType":
"org.apache.camel.component.tika.TikaParseOutputFormat", "enum": [ "xml",
"html", "text" ], "deprecated": false, "autowired": false, "secret": false,
"defaultValue": "xml", "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", "descriptio [...]
"lazyStartProducer": { "index": 3, "kind": "parameter", "displayName":
"Lazy Start Producer", "group": "producer (advanced)", "label":
"producer,advanced", "required": false, "type": "boolean", "javaType":
"boolean", "deprecated": false, "autowired": false, "secret": false,
"defaultValue": false, "description": "Whether the producer should be started
lazy (on the first message). By starting lazy you can use this to allow
CamelContext and routes to startup in situations where a produc [...]
"tikaConfig": { "index": 4, "kind": "parameter", "displayName": "Tika
Config", "group": "advanced", "label": "advanced", "required": false, "type":
"object", "javaType": "org.apache.tika.config.TikaConfig", "deprecated": false,
"autowired": false, "secret": false, "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", "description": "Tika Config" },
"tikaConfigUri": { "index": 5, "kind": "parameter", "displayName": "Tika
Config Uri", "group": "advanced", "label": "advanced", "required": false,
"type": "string", "javaType": "java.lang.String", "deprecated": false,
"autowired": false, "secret": false, "configurationClass":
"org.apache.camel.component.tika.TikaConfiguration", "configurationField":
"tikaConfiguration", "description": "Tika Config Url" }
diff --git
a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaParseOutputFormat.java
b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaParseOutputFormat.java
index 67665d58f69c..0da0d3564061 100644
---
a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaParseOutputFormat.java
+++
b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaParseOutputFormat.java
@@ -23,8 +23,6 @@ package org.apache.camel.component.tika;
* <li>xml: Returns Parsed Content as XML.</li>
* <li>html: Returns Parsed Content as HTML.</li>
* <li>text: Returns Parsed Content as Text.</li>
- * <li>textMain: Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a> library to automatically extract
- * the main content from a web page.</li>
* </ul>
*
*/
@@ -32,5 +30,4 @@ public enum TikaParseOutputFormat {
xml,
html,
text,
- textMain;
}
diff --git
a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java
b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java
index 58a663866767..9328dc47d344 100644
---
a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java
+++
b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java
@@ -46,7 +46,6 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
-import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -150,9 +149,6 @@ public class TikaProducer extends DefaultProducer {
case text:
result = new BodyContentHandler(new
OutputStreamWriter(outputStream, this.encoding));
break;
- case textMain:
- result = new BoilerpipeContentHandler(new OutputStreamWriter(outputStream, this.encoding));
- break;
case html:
result = new
ExpandedTitleContentHandler(getTransformerHandler(outputStream, "html", true));
break;
diff --git
a/docs/user-manual/modules/ROOT/pages/camel-4x-upgrade-guide-4_10.adoc
b/docs/user-manual/modules/ROOT/pages/camel-4x-upgrade-guide-4_10.adoc
index f571dd9d230e..00c34cffad17 100644
--- a/docs/user-manual/modules/ROOT/pages/camel-4x-upgrade-guide-4_10.adoc
+++ b/docs/user-manual/modules/ROOT/pages/camel-4x-upgrade-guide-4_10.adoc
@@ -4,6 +4,12 @@ This document is for helping you upgrade your Apache Camel
application
from Camel 4.x to 4.y. For example, if you are upgrading Camel 4.0 to 4.2,
then you should follow the guides
from both 4.0 to 4.1 and 4.1 to 4.2.
+== Upgrading from 4.10.7 to 4.10.8
+
+=== camel-tika
+
+Upgraded to Tika v3, and removed `textMain` from `tikaParseOutputFormat` option.
+
== Upgrading from 4.10.2 to 4.10.7
=== camel-file / camel-ftp / camel-smb / camel-azure-files
diff --git a/parent/pom.xml b/parent/pom.xml
index 552d4675618f..86b05de9a57b 100644
--- a/parent/pom.xml
+++ b/parent/pom.xml
@@ -487,7 +487,7 @@
<tahu-version>1.0.13</tahu-version>
<testcontainers-version>1.20.4</testcontainers-version>
<thymeleaf-version>3.1.3.RELEASE</thymeleaf-version>
- <tika-version>2.9.4</tika-version>
+ <tika-version>3.2.3</tika-version>
<twilio-version>10.6.8</twilio-version>
<twitter4j-version>4.1.2</twitter4j-version>
<undertow-version>2.3.20.Final</undertow-version>