This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch generative-artifact-creation in repository https://gitbox.apache.org/repos/asf/opennlp-models.git
commit 137267ccb14cf566e40f5afb850a415dbb343759 Author: Richard Zowalla <[email protected]> AuthorDate: Fri May 24 13:44:47 2024 +0200 This is a conceptual draft to propose a way to automatically build JARs from our OpenNLP models during a Maven build. It still relies on the models put on dist.a.o. but enables a way to distribute them via Maven Central for easier consumption within the Java ecosystem. --- .github/workflows/maven.yml | 37 +++++++ .gitignore | 1 + opennlp-models-langdetect/pom.xml | 41 +++++++- .../src/main/resources/langdetect-183.bin | 3 - .../src/main/resources/model.properties | 18 ++++ .../opennlp-models-sendetect-de/pom.xml | 75 +++++++++++++++ .../src/main/resources/model.properties | 18 ++++ .../opennlp-models-sendetect-en/pom.xml | 75 +++++++++++++++ .../src/main/resources/model.properties | 18 ++++ .../opennlp-models-sendetect-fr/pom.xml | 75 +++++++++++++++ .../src/main/resources/model.properties | 18 ++++ .../opennlp-models-sendetect-it/pom.xml | 75 +++++++++++++++ .../src/main/resources/model.properties | 18 ++++ .../opennlp-models-sendetect-nl/pom.xml | 75 +++++++++++++++ .../src/main/resources/model.properties | 18 ++++ opennlp-models-sendetect/pom.xml | 46 +++++++++ opennlp-models-test/pom.xml | 59 ++++++++++++ .../src/main/java/org.apache.opennlp/Main.java | 106 +++++++++++++++++++++ .../src/main/resources/expected-models.txt | 21 ++++ pom.xml | 49 ++++++++++ 20 files changed, 841 insertions(+), 5 deletions(-) diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml new file mode 100644 index 0000000..c995950 --- /dev/null +++ b/.github/workflows/maven.yml @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Java CI + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/cache@v3 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + - name: Setup Java + uses: actions/setup-java@v3 + with: + distribution: adopt + java-version: 17 + - name: Build with Maven + run: mvn -V clean install --no-transfer-progress \ No newline at end of file diff --git a/.gitignore b/.gitignore index 81ef51f..035c795 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ nbactions.xml nb-configuration.xml *.DS_Store .checkstyle +*.bin \ No newline at end of file diff --git a/opennlp-models-langdetect/pom.xml b/opennlp-models-langdetect/pom.xml index 518b455..7d4933a 100644 --- a/opennlp-models-langdetect/pom.xml +++ b/opennlp-models-langdetect/pom.xml @@ -30,8 +30,45 @@ </parent> <artifactId>opennlp-models-langdetect</artifactId> - <version>0.1-SNAPSHOT</version> - <name>Apache OpenNLP Models Lang-Detect</name> + <name>Apache OpenNLP Models :: Lang-Detect</name> + <properties> + <dist.base>${asf.dist.base}</dist.base> + <model.family>langdetect</model.family> + <model.name>langdetect-183.bin</model.name> + <model.version>1.8.3</model.version> + <model.md5>87be0a1cf60e5d8998e521401a87ca97</model.md5> + </properties> + + <build> + <resources> + <resource> + 
<directory>src/main/resources</directory> + <filtering>true</filtering> + <includes> + <include>**/model.properties</include> + </includes> + </resource> + <resource> + <directory>src/main/resources</directory> + <includes> + <include>**/*.bin</include> + </includes> + </resource> + </resources> + <plugins> + <plugin> + <groupId>com.googlecode.maven-download-plugin</groupId> + <artifactId>download-maven-plugin</artifactId> + <configuration> + <url>${dist.base}/${model.family}/${model.version}/${model.name}</url> + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + </plugin> + </plugins> + </build> </project> diff --git a/opennlp-models-langdetect/src/main/resources/langdetect-183.bin b/opennlp-models-langdetect/src/main/resources/langdetect-183.bin deleted file mode 100644 index 05dc88e..0000000 --- a/opennlp-models-langdetect/src/main/resources/langdetect-183.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ddf585fac2e02a9dcfb9a4a9cc9417562eaac351be2efb506a2eaa87f19e9d4 -size 10568188 diff --git a/opennlp-models-langdetect/src/main/resources/model.properties b/opennlp-models-langdetect/src/main/resources/model.properties new file mode 100644 index 0000000..023541a --- /dev/null +++ b/opennlp-models-langdetect/src/main/resources/model.properties @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model.name=${model.name} +model.version=${model.version} +model.md5=${model.md5} \ No newline at end of file diff --git a/opennlp-models-sendetect/opennlp-models-sendetect-de/pom.xml b/opennlp-models-sendetect/opennlp-models-sendetect-de/pom.xml new file mode 100644 index 0000000..e53f035 --- /dev/null +++ b/opennlp-models-sendetect/opennlp-models-sendetect-de/pom.xml @@ -0,0 +1,75 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-sendetect</artifactId> + <version>0.1-SNAPSHOT</version> + </parent> + + <artifactId>opennlp-models-sendetect-de</artifactId> + + <name>Apache OpenNLP Models :: Sen-Detect :: German</name> + + <properties> + <dist.base>${asf.dist.base}</dist.base> + <model.family>ud-models-1.0</model.family> + <model.name>opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin</model.name> + <model.version>1.9.3</model.version> + <model.md5>20d335035a6958ec34fef6ceec8e7307</model.md5> + </properties> + + <build> + <resources> + <resource> + <directory>src/main/resources</directory> + <filtering>true</filtering> + <includes> + <include>**/model.properties</include> + </includes> + </resource> + <resource> + <directory>src/main/resources</directory> + <includes> + <include>**/*.bin</include> + </includes> + </resource> + </resources> + <plugins> + <plugin> + <groupId>com.googlecode.maven-download-plugin</groupId> + <artifactId>download-maven-plugin</artifactId> + <configuration> + <url>${dist.base}/${model.family}/${model.name}</url> + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file diff --git a/opennlp-models-sendetect/opennlp-models-sendetect-de/src/main/resources/model.properties b/opennlp-models-sendetect/opennlp-models-sendetect-de/src/main/resources/model.properties new file mode 100644 index 0000000..023541a --- /dev/null +++ b/opennlp-models-sendetect/opennlp-models-sendetect-de/src/main/resources/model.properties @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# 
contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model.name=${model.name} +model.version=${model.version} +model.md5=${model.md5} \ No newline at end of file diff --git a/opennlp-models-sendetect/opennlp-models-sendetect-en/pom.xml b/opennlp-models-sendetect/opennlp-models-sendetect-en/pom.xml new file mode 100644 index 0000000..f66bd09 --- /dev/null +++ b/opennlp-models-sendetect/opennlp-models-sendetect-en/pom.xml @@ -0,0 +1,75 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-sendetect</artifactId> + <version>0.1-SNAPSHOT</version> + </parent> + + <artifactId>opennlp-models-sendetect-en</artifactId> + + <name>Apache OpenNLP Models :: Sen-Detect :: English</name> + + <properties> + <dist.base>${asf.dist.base}</dist.base> + <model.family>ud-models-1.0</model.family> + <model.name>opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin</model.name> + <model.version>1.9.3</model.version> + <model.md5>5965ada99a2ca77beb8632bb47741b7a</model.md5> + </properties> + + <build> + <resources> + <resource> + <directory>src/main/resources</directory> + <filtering>true</filtering> + <includes> + <include>**/model.properties</include> + </includes> + </resource> + <resource> + <directory>src/main/resources</directory> + <includes> + <include>**/*.bin</include> + </includes> + </resource> + </resources> + <plugins> + <plugin> + <groupId>com.googlecode.maven-download-plugin</groupId> + <artifactId>download-maven-plugin</artifactId> + <configuration> + <url>${dist.base}/${model.family}/${model.name}</url> + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file diff --git a/opennlp-models-sendetect/opennlp-models-sendetect-en/src/main/resources/model.properties b/opennlp-models-sendetect/opennlp-models-sendetect-en/src/main/resources/model.properties new file mode 100644 index 0000000..023541a --- /dev/null +++ b/opennlp-models-sendetect/opennlp-models-sendetect-en/src/main/resources/model.properties @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# 
contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model.name=${model.name} +model.version=${model.version} +model.md5=${model.md5} \ No newline at end of file diff --git a/opennlp-models-sendetect/opennlp-models-sendetect-fr/pom.xml b/opennlp-models-sendetect/opennlp-models-sendetect-fr/pom.xml new file mode 100644 index 0000000..dd18ee5 --- /dev/null +++ b/opennlp-models-sendetect/opennlp-models-sendetect-fr/pom.xml @@ -0,0 +1,75 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-sendetect</artifactId> + <version>0.1-SNAPSHOT</version> + </parent> + + <artifactId>opennlp-models-sendetect-fr</artifactId> + + <name>Apache OpenNLP Models :: Sen-Detect :: French</name> + + <properties> + <dist.base>${asf.dist.base}</dist.base> + <model.family>ud-models-1.0</model.family> + <model.name>opennlp-1.0-1.9.3fr-ud-ftb-sentence-1.0-1.9.3.bin</model.name> + <model.version>1.9.3</model.version> + <model.md5>771252c520a0dc238af35911c139374c</model.md5> + </properties> + + <build> + <resources> + <resource> + <directory>src/main/resources</directory> + <filtering>true</filtering> + <includes> + <include>**/model.properties</include> + </includes> + </resource> + <resource> + <directory>src/main/resources</directory> + <includes> + <include>**/*.bin</include> + </includes> + </resource> + </resources> + <plugins> + <plugin> + <groupId>com.googlecode.maven-download-plugin</groupId> + <artifactId>download-maven-plugin</artifactId> + <configuration> + <url>${dist.base}/${model.family}/${model.name}</url> + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file diff --git a/opennlp-models-sendetect/opennlp-models-sendetect-fr/src/main/resources/model.properties b/opennlp-models-sendetect/opennlp-models-sendetect-fr/src/main/resources/model.properties new file mode 100644 index 0000000..023541a --- /dev/null +++ b/opennlp-models-sendetect/opennlp-models-sendetect-fr/src/main/resources/model.properties @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or 
more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model.name=${model.name} +model.version=${model.version} +model.md5=${model.md5} \ No newline at end of file diff --git a/opennlp-models-sendetect/opennlp-models-sendetect-it/pom.xml b/opennlp-models-sendetect/opennlp-models-sendetect-it/pom.xml new file mode 100644 index 0000000..47aa391 --- /dev/null +++ b/opennlp-models-sendetect/opennlp-models-sendetect-it/pom.xml @@ -0,0 +1,75 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-sendetect</artifactId> + <version>0.1-SNAPSHOT</version> + </parent> + + <artifactId>opennlp-models-sendetect-it</artifactId> + + <name>Apache OpenNLP Models :: Sen-Detect :: Italian</name> + + <properties> + <dist.base>${asf.dist.base}</dist.base> + <model.family>ud-models-1.0</model.family> + <model.name>opennlp-it-ud-vit-sentence-1.0-1.9.3.bin</model.name> + <model.version>1.9.3</model.version> + <model.md5>3083dc13ba071c5aca94f81eeed6c097</model.md5> + </properties> + + <build> + <resources> + <resource> + <directory>src/main/resources</directory> + <filtering>true</filtering> + <includes> + <include>**/model.properties</include> + </includes> + </resource> + <resource> + <directory>src/main/resources</directory> + <includes> + <include>**/*.bin</include> + </includes> + </resource> + </resources> + <plugins> + <plugin> + <groupId>com.googlecode.maven-download-plugin</groupId> + <artifactId>download-maven-plugin</artifactId> + <configuration> + <url>${dist.base}/${model.family}/${model.name}</url> + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file diff --git a/opennlp-models-sendetect/opennlp-models-sendetect-it/src/main/resources/model.properties b/opennlp-models-sendetect/opennlp-models-sendetect-it/src/main/resources/model.properties new file mode 100644 index 0000000..023541a --- /dev/null +++ b/opennlp-models-sendetect/opennlp-models-sendetect-it/src/main/resources/model.properties @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# 
contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model.name=${model.name} +model.version=${model.version} +model.md5=${model.md5} \ No newline at end of file diff --git a/opennlp-models-sendetect/opennlp-models-sendetect-nl/pom.xml b/opennlp-models-sendetect/opennlp-models-sendetect-nl/pom.xml new file mode 100644 index 0000000..ffd349e --- /dev/null +++ b/opennlp-models-sendetect/opennlp-models-sendetect-nl/pom.xml @@ -0,0 +1,75 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-sendetect</artifactId> + <version>0.1-SNAPSHOT</version> + </parent> + + <artifactId>opennlp-models-sendetect-nl</artifactId> + + <name>Apache OpenNLP Models :: Sen-Detect :: Dutch</name> + + <properties> + <dist.base>${asf.dist.base}</dist.base> + <model.family>ud-models-1.0</model.family> + <model.name>opennlp-nl-ud-alpino-sentence-1.0-1.9.3.bin</model.name> + <model.version>1.9.3</model.version> + <model.md5>ed160f2cf99b249017d7fc3d3ad8c6b7</model.md5> + </properties> + + <build> + <resources> + <resource> + <directory>src/main/resources</directory> + <filtering>true</filtering> + <includes> + <include>**/model.properties</include> + </includes> + </resource> + <resource> + <directory>src/main/resources</directory> + <includes> + <include>**/*.bin</include> + </includes> + </resource> + </resources> + <plugins> + <plugin> + <groupId>com.googlecode.maven-download-plugin</groupId> + <artifactId>download-maven-plugin</artifactId> + <configuration> + <url>${dist.base}/${model.family}/${model.name}</url> + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file diff --git a/opennlp-models-sendetect/opennlp-models-sendetect-nl/src/main/resources/model.properties b/opennlp-models-sendetect/opennlp-models-sendetect-nl/src/main/resources/model.properties new file mode 100644 index 0000000..023541a --- /dev/null +++ b/opennlp-models-sendetect/opennlp-models-sendetect-nl/src/main/resources/model.properties @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# 
contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model.name=${model.name} +model.version=${model.version} +model.md5=${model.md5} \ No newline at end of file diff --git a/opennlp-models-sendetect/pom.xml b/opennlp-models-sendetect/pom.xml new file mode 100644 index 0000000..61cb86d --- /dev/null +++ b/opennlp-models-sendetect/pom.xml @@ -0,0 +1,46 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models</artifactId> + <version>0.1-SNAPSHOT</version> + </parent> + + <artifactId>opennlp-models-sendetect</artifactId> + + <name>Apache OpenNLP Models :: Sen-Detect</name> + + <packaging>pom</packaging> + + <modules> + <module>opennlp-models-sendetect-de</module> + <module>opennlp-models-sendetect-it</module> + <module>opennlp-models-sendetect-en</module> + <module>opennlp-models-sendetect-fr</module> + <module>opennlp-models-sendetect-nl</module> + </modules> + +</project> \ No newline at end of file diff --git a/opennlp-models-test/pom.xml b/opennlp-models-test/pom.xml new file mode 100644 index 0000000..303f128 --- /dev/null +++ b/opennlp-models-test/pom.xml @@ -0,0 +1,59 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models</artifactId> + <version>0.1-SNAPSHOT</version> + </parent> + + <artifactId>opennlp-models-test</artifactId> + <name>Apache OpenNLP Models :: Tests</name> + + <build> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>exec-maven-plugin</artifactId> + <version>3.3.0</version> + <executions> + <execution> + <phase>verify</phase> + <goals> + <goal>java</goal> + </goals> + </execution> + </executions> + <configuration> + <mainClass>org.apache.opennlp.Main</mainClass> + <arguments> + <argument>${project.basedir}</argument> + </arguments> + </configuration> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file diff --git a/opennlp-models-test/src/main/java/org.apache.opennlp/Main.java b/opennlp-models-test/src/main/java/org.apache.opennlp/Main.java new file mode 100644 index 0000000..ed446a6 --- /dev/null +++ b/opennlp-models-test/src/main/java/org.apache.opennlp/Main.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.jar.JarFile;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+
+/**
+ * Build-time sanity check: verifies that every model listed in expected-models.txt
+ * is packaged in one of the opennlp-models JAR files found below the project directory.
+ */
+public class Main {
+
+  /** Runs the check; expects exactly one argument, the base directory of this test module. */
+  public static void main(String[] args) {
+    if (args.length != 1) {
+      throw new IllegalArgumentException("This tool expects exactly one argument");
+    }
+    System.err.println("Executing basic model validation checks.");
+
+    final Path testBaseDir = Path.of(args[0]);
+    final Path projectDir = testBaseDir.getParent();
+    final List<String> expectedModels = getExpectedModels();
+    final String pattern = "opennlp-models.*\\.jar";
+    final List<Path> availableModelJars = getAvailableModelJars(pattern, testBaseDir, projectDir);
+
+    if (expectedModels.size() != availableModelJars.size()) {
+      throw new IllegalArgumentException("Detected a mismatch between expected and available models! "
+          + "Expected: " + expectedModels.size() + "; Actual: " + availableModelJars.size());
+    }
+
+    // Every expected model has to be present in at least one JAR. Do not leave main() on the
+    // first hit: that would skip the check for all remaining entries of the list.
+    for (String model : expectedModels) {
+      final boolean found = availableModelJars.stream()
+          .anyMatch(availableJar -> isModelInJar(availableJar, model));
+      if (!found) {
+        throw new IllegalArgumentException(
+            "Expected model '" + model + "' could not be found inside the generated JAR files!");
+      }
+    }
+  }
+
+  /** Returns true if the JAR at jarFilePath contains an entry whose name equals expectedModel. */
+  public static boolean isModelInJar(Path jarFilePath, String expectedModel) {
+    try (JarFile jarFile = new JarFile(jarFilePath.toFile())) {
+      return jarFile.stream()
+          .anyMatch(entry -> entry.getName().equals(expectedModel));
+    } catch (IOException e) {
+      throw new RuntimeException("Failed to read the JAR file: " + jarFilePath, e);
+    }
+  }
+
+  /** Collects regular files below projectDir, outside testDir, whose file name matches pattern. */
+  private static List<Path> getAvailableModelJars(String pattern, Path testDir, Path projectDir) {
+    final Pattern regexPattern = Pattern.compile(pattern);
+    try (Stream<Path> stream = Files.walk(projectDir)) {
+      return stream
+          .filter(Files::isRegularFile)
+          .filter(path -> !path.startsWith(testDir))
+          .filter(path -> regexPattern.matcher(path.getFileName().toString()).matches())
+          .toList();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /** Reads expected-models.txt from the classpath, skipping '#' comment lines and blank lines. */
+  private static List<String> getExpectedModels() {
+    try (InputStream inputStream = Thread.currentThread().getContextClassLoader().getResourceAsStream("expected-models.txt")) {
+      if (inputStream == null) {
+        throw new IllegalArgumentException("Expected model file could not be found!");
+      }
+
+      try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) {
+        return reader.lines()
+            .filter(line -> !line.startsWith("#") && !line.trim().isEmpty())
+            .toList();
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}
diff --git a/opennlp-models-test/src/main/resources/expected-models.txt b/opennlp-models-test/src/main/resources/expected-models.txt
new file mode 100644
index 0000000..68c678a
--- /dev/null
+++
b/opennlp-models-test/src/main/resources/expected-models.txt @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +langdetect-183.bin +opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin +opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin +opennlp-fr-ud-ftb-sentence-1.0-1.9.3.bin +opennlp-it-ud-vit-sentence-1.0-1.9.3.bin +opennlp-nl-ud-alpino-sentence-1.0-1.9.3.bin \ No newline at end of file diff --git a/pom.xml b/pom.xml index a931648..9799f97 100644 --- a/pom.xml +++ b/pom.xml @@ -96,6 +96,14 @@ <java.version>17</java.version> <maven.version>3.3.9</maven.version> <enforcer.plugin.version>3.3.0</enforcer.plugin.version> + + <junit.version>5.10.1</junit.version> + + <asf.dist.base>https://dist.apache.org/repos/dist/release/opennlp/models/</asf.dist.base> + <sf.dist.base>https://opennlp.sourceforge.net/models-1.5/</sf.dist.base> + + <!-- set a fixed value here to enable reproducible builds --> + <project.build.outputTimestamp>2024-01-01T00:00:00Z</project.build.outputTimestamp> </properties> <build> @@ -111,6 +119,45 @@ <mavenExecutorId>forked-path</mavenExecutorId> </configuration> </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + <version>3.6.0</version> + 
<executions> + <execution> + <id>add-resource</id> + <phase>generate-resources</phase> + <goals> + <goal>add-resource</goal> + </goals> + <configuration> + <resources> + <resource> + <directory>${project.build.directory}/models/</directory> + </resource> + </resources> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>com.googlecode.maven-download-plugin</groupId> + <artifactId>download-maven-plugin</artifactId> + <version>1.9.0</version> + <executions> + <execution> + <id>download-model</id> + <phase>generate-resources</phase> + <goals> + <goal>wget</goal> + </goals> + </execution> + </executions> + <configuration> + <outputDirectory>${project.build.directory}/models</outputDirectory> + <md5>${model.md5}</md5> + </configuration> + </plugin> </plugins> </pluginManagement> @@ -201,6 +248,8 @@ <modules> <module>opennlp-models-langdetect</module> + <module>opennlp-models-sendetect</module> + <module>opennlp-models-test</module> </modules> </project>
