(opennlp-models) branch main updated: OPENNLP-1639 Add basic ud-train.conf file and reference it in ud-train.sh (#27)

mawiesne Mon, 11 Nov 2024 21:02:06 -0800

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp-models.git



The following commit(s) were added to refs/heads/main by this push:
     new 12ff4a0  OPENNLP-1639 Add basic ud-train.conf file and reference it in ud-train.sh (#27)
12ff4a0 is described below

commit 12ff4a0af6dfdc525083cd8b781ebccfaab79628
Author: Martin Wiesner <mawie...@users.noreply.github.com>
AuthorDate: Tue Nov 12 06:01:42 2024 +0100

    OPENNLP-1639 Add basic ud-train.conf file and reference it in ud-train.sh (#27)
    
    - introduces OPENNLP_CONFIG to ud-train.sh
    - adds '-params ..' for existing training CLI calls
    - adds ud-train.conf with reasonable default values
    - bumps OpenNLP version to latest release (2.5.0)
---
 .../src/main/resources/ud-train.conf               | 20 ++++++++++++++++++++
 .../src/main/resources/ud-train.sh                 | 22 +++++++++++-----------
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git 
a/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.conf
 
b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.conf
new file mode 100644
index 0000000..fdcc5c7
--- /dev/null
+++ 
b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.conf
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+Algorithm=MAXENT
+Iterations=150
+Cutoff=5
+Threads=4
\ No newline at end of file
diff --git 
a/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
 
b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
index 2ed92af..5155e5b 100755
--- 
a/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
+++ 
b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
@@ -23,17 +23,17 @@ set -e
 
 # Script configuration
 UD_HOME="./"
-OPENNLP_VERSION="opennlp-2.4.0"
-OPENNLP_VERSION_NUMERIC="2.4.0"
+OPENNLP_VERSION="opennlp-2.5.0"
+OPENNLP_VERSION_NUMERIC="2.5.0"
 OPENNLP_MODEL_VERSION="1.1"
-OPENNLP_HOME="./apache-opennlp-2.4.0"
-OUTPUT_MODELS="./ud-models-2.4.0"
-GPG_PUBLIC_KEY="" # the public key from the OPENNLP KEYS file in short form.
+OPENNLP_HOME="./apache-opennlp-2.5.0"
+OPENNLP_CONFIG="ud-train.conf" # the file to configure the number of compute threads and training iterations
+OUTPUT_MODELS="./ud-models-2.5.0"
+GPG_PUBLIC_KEY="" # the public key from the OPENNLP KEYS file in short form
 EVAL_AFTER_TRAINING="true"
 CREATE_RELEASE="true"
 ENCODING="UTF-8"
 
-
 # Model(s) to train
 declare -a MODELS=("English|en|EWT" "Dutch|nl|Alpino" "French|fr|GSD" 
"German|de|GSD" "Italian|it|VIT" "Bulgarian|bg|BTB" "Czech|cs|PDT" 
"Croatian|hr|SET" "Danish|da|DDT" "Estonian|et|EDT" "Finnish|fi|TDT" 
"Latvian|lv|LVTB" "Norwegian|no|Bokmaal" "Polish|pl|PDB" "Portuguese|pt|GSD" 
"Romanian|ro|RRT" "Russian|ru|GSD" "Serbian|sr|SET" "Slovenian|sl|SSJ" 
"Spanish|es|GSD" "Slovak|sk|SNK" "Swedish|sv|Talbanken" "Ukrainian|uk|IU")
 
@@ -51,7 +51,7 @@ do
 
   # Tokenizer model
   echo -e "\nTraining tokenizer model ${SUBSET} ${LANG}..."
-  ${OPENNLP_HOME}/bin/opennlp TokenizerTrainer.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -lang ${LANGCODE} -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
 -encoding ${ENCODING} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
+  ${OPENNLP_HOME}/bin/opennlp TokenizerTrainer.conllu -params 
${UD_HOME}/${OPENNLP_CONFIG} -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -lang ${LANGCODE} -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
 -encoding ${ENCODING} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
 
   if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
     echo -e "\nEvaluating tokenizer model ${SUBSET} ${LANG}..."
@@ -67,10 +67,10 @@ do
   
   # Sentence model
   echo -e "\nTraining sentence model ${SUBSET} ${LANG}..."
-  ${OPENNLP_HOME}/bin/opennlp SentenceDetectorTrainer.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -lang ${LANGCODE} -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
 -encoding ${ENCODING} -sentencesPerSample 10 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
+  ${OPENNLP_HOME}/bin/opennlp SentenceDetectorTrainer.conllu -params 
${UD_HOME}/${OPENNLP_CONFIG} -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -lang ${LANGCODE} -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
 -encoding ${ENCODING} -sentencesPerSample 10 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
 
   if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
-    echo -e "Evaluating sentence model ${SUBSET} ${LANG}..."
+    echo -e "\nEvaluating sentence model ${SUBSET} ${LANG}..."
     ${OPENNLP_HOME}/bin/opennlp SentenceDetectorEvaluator.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu
 -encoding ${ENCODING} -sentencesPerSample 10 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
   fi
 
@@ -83,7 +83,7 @@ do
 
   # POS model
   echo -e "\nTraining POS model ${SUBSET} ${LANG}..."
-  ${OPENNLP_HOME}/bin/opennlp POSTaggerTrainer.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
 -encoding ${ENCODING} -lang ${LANGCODE} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
 > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VE 
[...]
+  ${OPENNLP_HOME}/bin/opennlp POSTaggerTrainer.conllu -params 
${UD_HOME}/${OPENNLP_CONFIG} -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -data 
./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
 -encoding ${ENCODING} -lang ${LANGCODE} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
 > ${OUTPUT_MODELS}/opennlp-${LANGCODE}- [...]
 
   if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
     echo -e "\nEvaluating POS model ${SUBSET} ${LANG}..."
@@ -101,7 +101,7 @@ done
 
 if [ ${CREATE_RELEASE} == "true" ]; then
     cd ${OUTPUT_MODELS};
-    echo -e "\nCreate ZIP with eval and train logs."
+    echo -e "\nCreating ZIP with eval and training logs..."
     zip 
opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip
 *train *.eval
     sha512sum 
opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip
 > 
opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip.sha512
     sha256sum 
opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip
 > 
opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip.sha256

(opennlp-models) branch main updated: OPENNLP-1639 Add basic ud-train.conf file and reference it in ud-train.sh (#27)

Reply via email to