This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4138 in repository https://gitbox.apache.org/repos/asf/tika.git
commit cbc46ee9b5295bf14541da8d1f016261c5e30196 Author: tallison <[email protected]> AuthorDate: Fri Sep 22 10:31:47 2023 -0400 TIKA-4138 -- move BoilerpipeContentHandler --- CHANGES.txt | 5 ++ pom.xml | 1 + tika-app/pom.xml | 2 +- tika-bom/pom.xml | 2 +- tika-bundles/tika-bundle-standard/pom.xml | 2 +- tika-handlers/README.md | 2 + tika-handlers/pom.xml | 48 ++++++++++++++ .../tika-handler-boilerpipe/pom.xml | 26 ++++++-- .../sax/boilerpipe/BoilerpipeContentHandler.java | 0 .../tika-parsers-standard-modules/pom.xml | 1 - .../tika-parser-html-commons/pom.xml | 74 ---------------------- .../tika-parsers-standard-package/pom.xml | 2 +- tika-server/tika-server-core/pom.xml | 2 +- tika-server/tika-server-standard/pom.xml | 6 +- 14 files changed, 86 insertions(+), 87 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 30c137609..408e42676 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,7 +1,12 @@ Release 3.0.0-BETA - ?? + BREAKING CHANGES + * Require Java 11 (TIKA-4128). + * The boilerpipe handler has been moved to tika-handler-boilerpipe (TIKA-4138). + + Other Changes/Updates * Fix bug in DateUtils that stripped timezone information from incoming Calendar objects (TIKA-4126). 
diff --git a/pom.xml b/pom.xml index ab6b22afa..31f025576 100644 --- a/pom.xml +++ b/pom.xml @@ -54,6 +54,7 @@ <module>tika-example</module> <module>tika-java7</module> <module>tika-detectors</module> + <module>tika-handlers</module> </modules> <profiles> diff --git a/tika-app/pom.xml b/tika-app/pom.xml index 9a48d2ea9..68ac79477 100644 --- a/tika-app/pom.xml +++ b/tika-app/pom.xml @@ -45,7 +45,7 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-parser-html-commons</artifactId> + <artifactId>tika-handler-boilerpipe</artifactId> <version>${project.version}</version> </dependency> <dependency> diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml index ba2e19d73..5e1aca01e 100644 --- a/tika-bom/pom.xml +++ b/tika-bom/pom.xml @@ -222,7 +222,7 @@ </dependency> <dependency> <groupId>org.apache.tika</groupId> - <artifactId>tika-parser-html-commons</artifactId> + <artifactId>tika-handler-boilerpipe</artifactId> <version>3.0.0-SNAPSHOT</version> </dependency> <dependency> diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml index db605c044..1e18b1cb0 100644 --- a/tika-bundles/tika-bundle-standard/pom.xml +++ b/tika-bundles/tika-bundle-standard/pom.xml @@ -58,7 +58,7 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-parser-html-commons</artifactId> + <artifactId>tika-handler-boilerpipe</artifactId> <version>${project.version}</version> </dependency> <dependency> diff --git a/tika-handlers/README.md b/tika-handlers/README.md new file mode 100644 index 000000000..bb45651b3 --- /dev/null +++ b/tika-handlers/README.md @@ -0,0 +1,2 @@ +This package is intended to hold non-standard handlers. 
These may have dependencies that some don't want, +or they may have a focus that isn't general enough to warrant adding them to tika-core \ No newline at end of file diff --git a/tika-handlers/pom.xml b/tika-handlers/pom.xml new file mode 100644 index 000000000..fcab3eb20 --- /dev/null +++ b/tika-handlers/pom.xml @@ -0,0 +1,48 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parent</artifactId> + <version>3.0.0-SNAPSHOT</version> + <relativePath>../tika-parent/pom.xml</relativePath> + </parent> + + <artifactId>tika-handlers</artifactId> + + <name>Apache Tika handlers</name> + <packaging>pom</packaging> + + <modules> + <module>tika-handler-boilerpipe</module> + </modules> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + </dependencies> +</project> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md b/tika-handlers/tika-handler-boilerpipe/pom.xml similarity index 51% rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md rename to tika-handlers/tika-handler-boilerpipe/pom.xml index 82fb00a47..05d0b69b3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md +++ b/tika-handlers/tika-handler-boilerpipe/pom.xml @@ -1,4 +1,5 @@ -<!--- +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information @@ -16,7 +17,24 @@ specific language governing permissions and limitations under the License. --> -This module only contains the BoilerPipeContentHandler. The boilerpipe dependency is no -longer maintained and contains clashes with NekoHTML. 
+<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-handlers</artifactId> + <version>3.0.0-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> -In Tika 3.x, we should rename this module to tika-handler-boilerpipe or similar. \ No newline at end of file + <artifactId>tika-handler-boilerpipe</artifactId> + + <dependencies> + <dependency> + <groupId>de.l3s.boilerpipe</groupId> + <artifactId>boilerpipe</artifactId> + <version>1.1.0</version> + </dependency> + </dependencies> +</project> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java b/tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java similarity index 100% rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java rename to tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml index 5fb547f4e..6b163ea3e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml @@ -44,7 +44,6 @@ </dependency> </dependencies> <modules> - <module>tika-parser-html-commons</module> <module>tika-parser-jdbc-commons</module> <module>tika-parser-digest-commons</module> <module>tika-parser-mail-commons</module> 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml deleted file mode 100644 index 7e7a403bc..000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml +++ /dev/null @@ -1,74 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
---> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> - <parent> - <artifactId>tika-parsers-standard-modules</artifactId> - <groupId>org.apache.tika</groupId> - <version>3.0.0-SNAPSHOT</version> - </parent> - <modelVersion>4.0.0</modelVersion> - - <artifactId>tika-parser-html-commons</artifactId> - <name>Apache Tika html commons</name> - - <dependencies> - <dependency> - <groupId>de.l3s.boilerpipe</groupId> - <artifactId>boilerpipe</artifactId> - <version>${boilerpipe.version}</version> - </dependency> - </dependencies> - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <configuration> - <archive> - <manifestEntries> - <Automatic-Module-Name>org.apache.tika.sax.boilerpipe</Automatic-Module-Name> - </manifestEntries> - </archive> - </configuration> - <executions> - <execution> - <goals> - <goal>test-jar</goal> - </goals> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.rat</groupId> - <artifactId>apache-rat-plugin</artifactId> - <version>${rat.version}</version> - <configuration> - <excludes> - <exclude>README.md</exclude> - </excludes> - </configuration> - </plugin> - </plugins> - </build> - - <scm> - <tag>2.2.1-rc2</tag> - </scm> -</project> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml index 4de5eeec4..cb23c96d7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml @@ -186,7 +186,7 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-parser-html-commons</artifactId> + <artifactId>tika-handler-boilerpipe</artifactId> 
<version>${project.version}</version> <scope>test</scope> </dependency> diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml index b6794abb7..69a88523e 100644 --- a/tika-server/tika-server-core/pom.xml +++ b/tika-server/tika-server-core/pom.xml @@ -54,7 +54,7 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-parser-html-commons</artifactId> + <artifactId>tika-handler-boilerpipe</artifactId> <version>${project.version}</version> </dependency> <dependency> diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml index a6b2f9b72..c38c40f50 100644 --- a/tika-server/tika-server-standard/pom.xml +++ b/tika-server/tika-server-standard/pom.xml @@ -50,8 +50,8 @@ </exclusions> </dependency> <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parser-html-commons</artifactId> + <groupId>${project.groupId}</groupId> + <artifactId>tika-handler-boilerpipe</artifactId> <version>${project.version}</version> </dependency> <dependency> @@ -128,7 +128,7 @@ <exclude>org.apache.tika:tika-parsers-standard-package:jar:</exclude> <exclude>org.apache.tika:tika-serialization:jar:</exclude> <exclude>org.apache.tika:tika-langdetect-optimaize:jar:</exclude> - <exclude>org.apache.tika:tika-parser-html-commons:jar:</exclude> + <exclude>org.apache.tika:tika-handler-boilerpipe:jar:</exclude> <exclude>org.apache.tika:tika-parser-digest-commons:jar:</exclude> <exclude>org.apache.tika:tika-parser-zip-commons:jar:</exclude> <exclude>commons-codec:commons-codec:jar:</exclude>
