This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp-models.git


The following commit(s) were added to refs/heads/main by this push:
     new b020964  OPENNLP-1638 - Add initial training script (#26)
b020964 is described below

commit b020964ded48a9679dcf8c04d614a6f08329b0fa
Author: Richard Zowalla <[email protected]>
AuthorDate: Fri Nov 8 06:27:56 2024 +0100

    OPENNLP-1638 - Add initial training script (#26)
---
 .../opennlp-models-training-ud/pom.xml             |  35 +++++++
 .../src/main/resources/ud-train.sh                 | 115 +++++++++++++++++++++
 opennlp-models-training/pom.xml                    |  41 ++++++++
 pom.xml                                            |   1 +
 4 files changed, 192 insertions(+)

diff --git a/opennlp-models-training/opennlp-models-training-ud/pom.xml b/opennlp-models-training/opennlp-models-training-ud/pom.xml
new file mode 100644
index 0000000..949cab6
--- /dev/null
+++ b/opennlp-models-training/opennlp-models-training-ud/pom.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>org.apache.opennlp</groupId>
+        <artifactId>opennlp-models-training</artifactId>
+        <version>1.1.1-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>opennlp-models-training-ud</artifactId>
+    <name>Apache OpenNLP Models :: Training :: Universal Dependencies</name>
+
+</project>
\ No newline at end of file
diff --git a/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
new file mode 100755
index 0000000..2ed92af
--- /dev/null
+++ b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+# This script facilitates training OpenNLP models on Universal Dependencies 
(UD) data.
+
+# Script configuration
+UD_HOME="./"
+OPENNLP_VERSION="opennlp-2.4.0"
+OPENNLP_VERSION_NUMERIC="2.4.0"
+OPENNLP_MODEL_VERSION="1.1"
+OPENNLP_HOME="./apache-opennlp-2.4.0"
+OUTPUT_MODELS="./ud-models-2.4.0"
+GPG_PUBLIC_KEY="" # the public key from the OPENNLP KEYS file in short form.
+EVAL_AFTER_TRAINING="true"
+CREATE_RELEASE="true"
+ENCODING="UTF-8"
+
+
+# Model(s) to train
+declare -a MODELS=("English|en|EWT" "Dutch|nl|Alpino" "French|fr|GSD" 
"German|de|GSD" "Italian|it|VIT" "Bulgarian|bg|BTB" "Czech|cs|PDT" 
"Croatian|hr|SET" "Danish|da|DDT" "Estonian|et|EDT" "Finnish|fi|TDT" 
"Latvian|lv|LVTB" "Norwegian|no|Bokmaal" "Polish|pl|PDB" "Portuguese|pt|GSD" 
"Romanian|ro|RRT" "Russian|ru|GSD" "Serbian|sr|SET" "Slovenian|sl|SSJ" 
"Spanish|es|GSD" "Slovak|sk|SNK" "Swedish|sv|Talbanken" "Ukrainian|uk|IU")
+
+# Create output directory
+mkdir -p ${OUTPUT_MODELS}
+
+for i in "${MODELS[@]}"
+do
+
+  echo $i
+  LANG=`echo $i | cut -d'|' -f1`
+  LANGCODE=`echo $i | cut -d'|' -f2`
+  SUBSET=`echo $i | cut -d'|' -f3`
+  SUBSETLC=`echo ${SUBSET} | tr '[:upper:]' '[:lower:]'`
+
+  # Tokenizer model
+  echo -e "\nTraining tokenizer model ${SUBSET} ${LANG}..."
+  ${OPENNLP_HOME}/bin/opennlp TokenizerTrainer.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -lang ${LANGCODE} -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
 -encoding ${ENCODING} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
+
+  if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
+    echo -e "\nEvaluating tokenizer model ${SUBSET} ${LANG}..."
+    ${OPENNLP_HOME}/bin/opennlp TokenizerMEEvaluator.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu
 -encoding ${ENCODING} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
+  fi
+
+  if [ ${CREATE_RELEASE} == "true" ]; then
+    echo -e "\nCreating hashes and ASC signature for tokenizer model ${SUBSET} 
${LANG}..."
+    sha512sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512
+    sha256sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256
+    gpg --default-key $GPG_PUBLIC_KEY --armor --output 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc
 --detach-sign 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
+  fi
+  
+  # Sentence model
+  echo -e "\nTraining sentence model ${SUBSET} ${LANG}..."
+  ${OPENNLP_HOME}/bin/opennlp SentenceDetectorTrainer.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -lang ${LANGCODE} -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
 -encoding ${ENCODING} -sentencesPerSample 10 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
+
+  if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
+    echo -e "Evaluating sentence model ${SUBSET} ${LANG}..."
+    ${OPENNLP_HOME}/bin/opennlp SentenceDetectorEvaluator.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu
 -encoding ${ENCODING} -sentencesPerSample 10 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
+  fi
+
+  if [ ${CREATE_RELEASE} == "true" ]; then
+    echo -e "\nCreating hashes and ASC signature for sentence model ${SUBSET} 
${LANG}..."
+    sha512sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512
+    sha256sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256
+    gpg --default-key $GPG_PUBLIC_KEY --armor --output 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc
 --detach-sign 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
+  fi
+
+  # POS model
+  echo -e "\nTraining POS model ${SUBSET} ${LANG}..."
+  ${OPENNLP_HOME}/bin/opennlp POSTaggerTrainer.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
 -encoding ${ENCODING} -lang ${LANGCODE} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
 > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VE 
[...]
+
+  if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
+    echo -e "\nEvaluating POS model ${SUBSET} ${LANG}..."
+    ${OPENNLP_HOME}/bin/opennlp POSTaggerEvaluator.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu
 -encoding ${ENCODING} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
+  fi
+
+  if [ ${CREATE_RELEASE} == "true" ]; then
+    echo -e "\nCreating hashes and ASC signature for POS model ${SUBSET} 
${LANG}..."
+    sha512sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512
+    sha256sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256
+    gpg --default-key $GPG_PUBLIC_KEY --armor --output 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc
 --detach-sign 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
+  fi
+
+done
+
+# Release packaging: bundle the training and evaluation logs into a ZIP,
+# create checksums and a detached ASCII-armored signature for it, and strip
+# the output directory prefix from the checksum files.
+if [ "${CREATE_RELEASE}" == "true" ]; then
+    # Work inside the output directory so the ZIP and its checksum files are
+    # written without a path prefix; stop explicitly if it cannot be entered.
+    cd "${OUTPUT_MODELS}" || exit 1
+    LOGS_ZIP="opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip"
+
+    echo -e "\nCreate ZIP with eval and train logs."
+    zip "${LOGS_ZIP}" *.train *.eval
+    sha512sum "${LOGS_ZIP}" > "${LOGS_ZIP}.sha512"
+    sha256sum "${LOGS_ZIP}" > "${LOGS_ZIP}.sha256"
+    gpg --default-key "${GPG_PUBLIC_KEY}" --armor --output "${LOGS_ZIP}.asc" --detach-sign "${LOGS_ZIP}"
+
+    echo -e "\nRemove the path from sha512 and sha256 checksum files"
+    # GNU and BSD sed disagree on how "-i" takes its backup suffix: the BSD
+    # form 'sed -i ""' makes GNU sed treat "" as the script and the
+    # substitution as a file name. An attached suffix (-i.bak) is accepted by
+    # both, so edit with a backup and remove the backups afterwards.
+    sed -i.bak "s|${OUTPUT_MODELS}/||" *.sha512 *.sha256
+    rm -f -- *.sha512.bak *.sha256.bak
+
+fi
\ No newline at end of file
diff --git a/opennlp-models-training/pom.xml b/opennlp-models-training/pom.xml
new file mode 100644
index 0000000..700e050
--- /dev/null
+++ b/opennlp-models-training/pom.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>org.apache.opennlp</groupId>
+        <artifactId>opennlp-models</artifactId>
+        <version>1.1.1-SNAPSHOT</version>
+    </parent>
+
+    <packaging>pom</packaging>
+
+    <artifactId>opennlp-models-training</artifactId>
+    <name>Apache OpenNLP Models :: Training</name>
+
+    <modules>
+        <module>opennlp-models-training-ud</module>
+    </modules>
+
+</project>
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 32c1a06..f0d0f89 100644
--- a/pom.xml
+++ b/pom.xml
@@ -278,6 +278,7 @@
                <module>opennlp-models-pos</module>
                <module>opennlp-models-test</module>
                <module>opennlp-models-tokenizer</module>
+               <module>opennlp-models-training</module>
        </modules>
 
 </project>

Reply via email to