(opennlp-models) 01/01: OPENNLP-1640 Add training and evaluation of Lemmatizer models - adds execution phase for train & eval of lemma models for all supported languages - removes non-required line breaks in echo messages

mawiesne Tue, 12 Nov 2024 12:51:28 -0800

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch 
OPENNLP-1640-Add-training-and-evaluation-of-Lemmatizer-models
in repository https://gitbox.apache.org/repos/asf/opennlp-models.git


commit d429c7a239eddea5246a4c54113df3c3b12bff6c
Author: Martin Wiesner <[email protected]>
AuthorDate: Tue Nov 12 21:50:57 2024 +0100

    OPENNLP-1640 Add training and evaluation of Lemmatizer models
    - adds execution phase for train & eval of lemma models for all supported 
languages
    - removes non-required line breaks in echo messages
---
 .../src/main/resources/ud-train.sh                 | 41 ++++++++++++++++------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git 
a/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
 
b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
index 516eb14..867e250 100755
--- 
a/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
+++ 
b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
@@ -33,7 +33,7 @@ OPENNLP_CONFIG="ud-train.conf"
 # The directory a stable OpenNLP release is located in
 OPENNLP_HOME="./apache-opennlp-2.5.0"
 # The target version for training opennlp-models
-OPENNLP_MODEL_VERSION="1.1"
+OPENNLP_MODEL_VERSION="1.2"
 # The version of OpenNLP tools to use for training
 OPENNLP_VERSION_NUMERIC="2.5.0"
 # The directory the resulting binary models are written to
@@ -48,6 +48,7 @@ UD_HOME="./ud-treebanks-v2.14"
 TRAIN_TOKENIZER="true"
 TRAIN_POSTAGGER="true"
 TRAIN_SENTDETECT="true"
+TRAIN_LEMMATIZER="true"
 # If 'true', each resulting model is evaluated, 'false' otherwise
 EVAL_AFTER_TRAINING="true"
 # If 'true, training of experimental languages will be attempted, otherwise 
only stable languages & treebanks are used
@@ -65,7 +66,7 @@ mkdir -p ${OUTPUT_MODELS}
 
 for i in "${MODELS[@]}"
 do
-
+  echo -e "\n"
   echo $i
   LANG=`echo $i | cut -d'|' -f1`
   LANGCODE=`echo $i | cut -d'|' -f2`
@@ -74,15 +75,15 @@ do
 
   # Tokenizer model
   if [ ${TRAIN_TOKENIZER} == "true" ]; then
-    echo -e "\nTraining tokenizer model ${SUBSET} ${LANG}..."
+    echo -e "Training tokenizer model ${SUBSET} ${LANG}..."
     ${OPENNLP_HOME}/bin/opennlp TokenizerTrainer.conllu -params 
${TRAIN_HOME}/${OPENNLP_CONFIG} -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -lang ${LANGCODE} -data 
${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu 
-encoding ${ENCODING} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
 
     if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
-      echo -e "\nEvaluating tokenizer model ${SUBSET} ${LANG}..."
+      echo -e "Evaluating tokenizer model ${SUBSET} ${LANG}..."
       ${OPENNLP_HOME}/bin/opennlp TokenizerMEEvaluator.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu 
-encoding ${ENCODING} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
     fi
     if [ ${CREATE_RELEASE} == "true" ]; then
-      echo -e "\nCreating hashes and ASC signature for tokenizer model 
${SUBSET} ${LANG}..."
+      echo -e "Creating hashes and ASC signature for tokenizer model ${SUBSET} 
${LANG}..."
       sha512sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512
       sha256sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256
       gpg --default-key $GPG_PUBLIC_KEY --armor --output 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc
 --detach-sign 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
@@ -91,16 +92,16 @@ do
 
   # Sentence model
   if [ ${TRAIN_SENTDETECT} == "true" ]; then
-    echo -e "\nTraining sentence model ${SUBSET} ${LANG}..."
+    echo -e "Training sentence model ${SUBSET} ${LANG}..."
     ${OPENNLP_HOME}/bin/opennlp SentenceDetectorTrainer.conllu -params 
${TRAIN_HOME}/${OPENNLP_CONFIG} -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -lang ${LANGCODE} -data 
${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu 
-encoding ${ENCODING} -sentencesPerSample 10 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
 
     if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
-      echo -e "\nEvaluating sentence model ${SUBSET} ${LANG}..."
+      echo -e "Evaluating sentence model ${SUBSET} ${LANG}..."
       ${OPENNLP_HOME}/bin/opennlp SentenceDetectorEvaluator.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu 
-encoding ${ENCODING} -sentencesPerSample 10 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
     fi
 
     if [ ${CREATE_RELEASE} == "true" ]; then
-      echo -e "\nCreating hashes and ASC signature for sentence model 
${SUBSET} ${LANG}..."
+      echo -e "Creating hashes and ASC signature for sentence model ${SUBSET} 
${LANG}..."
       sha512sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512
       sha256sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256
       gpg --default-key $GPG_PUBLIC_KEY --armor --output 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc
 --detach-sign 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
@@ -109,22 +110,40 @@ do
 
   # POS model
   if [ ${TRAIN_POSTAGGER} == "true" ]; then
-    echo -e "\nTraining POS model ${SUBSET} ${LANG}..."
+    echo -e "Training POS model ${SUBSET} ${LANG}..."
     ${OPENNLP_HOME}/bin/opennlp POSTaggerTrainer.conllu -params 
${TRAIN_HOME}/${OPENNLP_CONFIG} -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu 
-encoding ${ENCODING} -lang ${LANGCODE} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
 
     if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
-      echo -e "\nEvaluating POS model ${SUBSET} ${LANG}..."
+      echo -e "Evaluating POS model ${SUBSET} ${LANG}..."
       ${OPENNLP_HOME}/bin/opennlp POSTaggerEvaluator.conllu -model 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 -data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu 
-encoding ${ENCODING} > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
     fi
 
     if [ ${CREATE_RELEASE} == "true" ]; then
-      echo -e "\nCreating hashes and ASC signature for POS model ${SUBSET} 
${LANG}..."
+      echo -e "Creating hashes and ASC signature for POS model ${SUBSET} 
${LANG}..."
       sha512sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512
       sha256sum 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
 > 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256
       gpg --default-key $GPG_PUBLIC_KEY --armor --output 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc
 --detach-sign 
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
     fi
   fi
 
+  # Lemmatizer model: train (log -> *.train), optionally evaluate (log -> *.eval), optionally hash and sign the resulting *-lemmas-*.bin for a release
+  if [ ${TRAIN_LEMMATIZER} == "true" ]; then
+    echo -e "Training Lemmatizer model ${SUBSET} ${LANG}..."
+    ${OPENNLP_HOME}/bin/opennlp LemmatizerTrainerME.conllu -params ${TRAIN_HOME}/${OPENNLP_CONFIG} -model ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin -data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu -encoding ${ENCODING} -lang ${LANGCODE} > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
+
+    if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
+      echo -e "Evaluating Lemmatizer model ${SUBSET} ${LANG}..."
+      ${OPENNLP_HOME}/bin/opennlp LemmatizerEvaluator.conllu -model ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin -data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu -encoding ${ENCODING} > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
+    fi # EVAL_AFTER_TRAINING
+
+    if [ ${CREATE_RELEASE} == "true" ]; then
+      echo -e "Creating hashes and ASC signature for Lemmatizer model ${SUBSET} ${LANG}..."
+      sha512sum ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512
+      sha256sum ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256
+      gpg --default-key $GPG_PUBLIC_KEY --armor --output ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc --detach-sign ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
+    fi # CREATE_RELEASE
+  fi # TRAIN_LEMMATIZER
+
 done
 
 # Conducts finalization steps to collect all training (and evaluation) log 
files into a zip

(opennlp-models) 01/01: OPENNLP-1640 Add training and evaluation of Lemmatizer models - adds execution phase for train & eval of lemma models for all supported languages - removes non-required line breaks in echo messages

Reply via email to