This is an automated email from the ASF dual-hosted git repository.
rzo1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp-models.git
The following commit(s) were added to refs/heads/main by this push:
new f3cea28 OPENNLP-1640 Add training and evaluation of Lemmatizer models
- adds execution phase for train & eval of lemma models for all supported
languages - removes non-required line breaks in echo messages
f3cea28 is described below
commit f3cea2868f2db079987b1282af13858ea02df25b
Author: Martin Wiesner <[email protected]>
AuthorDate: Tue Nov 12 21:50:57 2024 +0100
OPENNLP-1640 Add training and evaluation of Lemmatizer models
- adds execution phase for train & eval of lemma models for all supported
languages
- removes non-required line breaks in echo messages
---
.../src/main/resources/ud-train.sh | 41 ++++++++++++++++------
1 file changed, 30 insertions(+), 11 deletions(-)
diff --git
a/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
index 516eb14..867e250 100755
---
a/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
+++
b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh
@@ -33,7 +33,7 @@ OPENNLP_CONFIG="ud-train.conf"
# The directory a stable OpenNLP release is located in
OPENNLP_HOME="./apache-opennlp-2.5.0"
# The target version for training opennlp-models
-OPENNLP_MODEL_VERSION="1.1"
+OPENNLP_MODEL_VERSION="1.2"
# The version of OpenNLP tools to use for training
OPENNLP_VERSION_NUMERIC="2.5.0"
# The directory the resulting binary models are written to
@@ -48,6 +48,7 @@ UD_HOME="./ud-treebanks-v2.14"
TRAIN_TOKENIZER="true"
TRAIN_POSTAGGER="true"
TRAIN_SENTDETECT="true"
+TRAIN_LEMMATIZER="true"
# If 'true', each resulting model is evaluated, 'false' otherwise
EVAL_AFTER_TRAINING="true"
# If 'true', training of experimental languages will be attempted, otherwise
only stable languages & treebanks are used
@@ -65,7 +66,7 @@ mkdir -p ${OUTPUT_MODELS}
for i in "${MODELS[@]}"
do
-
+ echo -e "\n"
echo $i
LANG=`echo $i | cut -d'|' -f1`
LANGCODE=`echo $i | cut -d'|' -f2`
@@ -74,15 +75,15 @@ do
# Tokenizer model
if [ ${TRAIN_TOKENIZER} == "true" ]; then
- echo -e "\nTraining tokenizer model ${SUBSET} ${LANG}..."
+ echo -e "Training tokenizer model ${SUBSET} ${LANG}..."
${OPENNLP_HOME}/bin/opennlp TokenizerTrainer.conllu -params
${TRAIN_HOME}/${OPENNLP_CONFIG} -model
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
-lang ${LANGCODE} -data
${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
-encoding ${ENCODING} >
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
- echo -e "\nEvaluating tokenizer model ${SUBSET} ${LANG}..."
+ echo -e "Evaluating tokenizer model ${SUBSET} ${LANG}..."
${OPENNLP_HOME}/bin/opennlp TokenizerMEEvaluator.conllu -model
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
-data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu
-encoding ${ENCODING} >
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
fi
if [ ${CREATE_RELEASE} == "true" ]; then
- echo -e "\nCreating hashes and ASC signature for tokenizer model
${SUBSET} ${LANG}..."
+ echo -e "Creating hashes and ASC signature for tokenizer model ${SUBSET}
${LANG}..."
sha512sum
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
>
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512
sha256sum
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
>
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256
gpg --default-key $GPG_PUBLIC_KEY --armor --output
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc
--detach-sign
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
@@ -91,16 +92,16 @@ do
# Sentence model
if [ ${TRAIN_SENTDETECT} == "true" ]; then
- echo -e "\nTraining sentence model ${SUBSET} ${LANG}..."
+ echo -e "Training sentence model ${SUBSET} ${LANG}..."
${OPENNLP_HOME}/bin/opennlp SentenceDetectorTrainer.conllu -params
${TRAIN_HOME}/${OPENNLP_CONFIG} -model
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
-lang ${LANGCODE} -data
${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
-encoding ${ENCODING} -sentencesPerSample 10 >
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
- echo -e "\nEvaluating sentence model ${SUBSET} ${LANG}..."
+ echo -e "Evaluating sentence model ${SUBSET} ${LANG}..."
${OPENNLP_HOME}/bin/opennlp SentenceDetectorEvaluator.conllu -model
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
-data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu
-encoding ${ENCODING} -sentencesPerSample 10 >
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
fi
if [ ${CREATE_RELEASE} == "true" ]; then
- echo -e "\nCreating hashes and ASC signature for sentence model
${SUBSET} ${LANG}..."
+ echo -e "Creating hashes and ASC signature for sentence model ${SUBSET}
${LANG}..."
sha512sum
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
>
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512
sha256sum
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
>
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256
gpg --default-key $GPG_PUBLIC_KEY --armor --output
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc
--detach-sign
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
@@ -109,22 +110,40 @@ do
# POS model
if [ ${TRAIN_POSTAGGER} == "true" ]; then
- echo -e "\nTraining POS model ${SUBSET} ${LANG}..."
+ echo -e "Training POS model ${SUBSET} ${LANG}..."
${OPENNLP_HOME}/bin/opennlp POSTaggerTrainer.conllu -params
${TRAIN_HOME}/${OPENNLP_CONFIG} -model
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
-data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
-encoding ${ENCODING} -lang ${LANGCODE} >
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
- echo -e "\nEvaluating POS model ${SUBSET} ${LANG}..."
+ echo -e "Evaluating POS model ${SUBSET} ${LANG}..."
${OPENNLP_HOME}/bin/opennlp POSTaggerEvaluator.conllu -model
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
-data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu
-encoding ${ENCODING} >
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
fi
if [ ${CREATE_RELEASE} == "true" ]; then
- echo -e "\nCreating hashes and ASC signature for POS model ${SUBSET}
${LANG}..."
+ echo -e "Creating hashes and ASC signature for POS model ${SUBSET}
${LANG}..."
sha512sum
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
>
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512
sha256sum
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
>
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256
gpg --default-key $GPG_PUBLIC_KEY --armor --output
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc
--detach-sign
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
fi
fi
+ # Lemmatizer model: train; optionally evaluate; optionally hash and sign the '-lemmas-' model (signature goes to its own '-lemmas-...bin.asc', using GPG_PUBLIC_KEY like the other models)
+ if [ ${TRAIN_LEMMATIZER} == "true" ]; then
+ echo -e "Training Lemmatizer model ${SUBSET} ${LANG}..."
+ ${OPENNLP_HOME}/bin/opennlp LemmatizerTrainerME.conllu -params
${TRAIN_HOME}/${OPENNLP_CONFIG} -model
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
-data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu
-encoding ${ENCODING} -lang ${LANGCODE} >
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train
+
+ if [ ${EVAL_AFTER_TRAINING} == "true" ]; then
+ echo -e "Evaluating Lemmatizer model ${SUBSET} ${LANG}..."
+ ${OPENNLP_HOME}/bin/opennlp LemmatizerEvaluator.conllu -model
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
-data ${UD_HOME}/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu
-encoding ${ENCODING} >
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval
+ fi
+
+ if [ ${CREATE_RELEASE} == "true" ]; then
+ echo -e "Creating hashes and ASC signature for Lemmatizer model
${SUBSET} ${LANG}..."
+ sha512sum
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
>
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512
+ sha256sum
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
>
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256
+ gpg --default-key $GPG_PUBLIC_KEY --armor --output
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc
--detach-sign
${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-lemmas-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin
+ fi
+ fi
+
done
# Conducts finalization steps to collect all training (and evaluation) log
files into a zip