OPENNLP-966: Remove deprecated UIMA trainers
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f0020c40 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f0020c40 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f0020c40 Branch: refs/heads/master Commit: f0020c407098873fbd5a369de4863e6b9adc592b Parents: 15939e9 Author: Jörn Kottmann <jo...@apache.org> Authored: Thu Jan 26 23:05:44 2017 +0100 Committer: Jörn Kottmann <jo...@apache.org> Committed: Fri Jan 27 13:58:02 2017 +0100 ---------------------------------------------------------------------- opennlp-uima/descriptors/ChunkerTrainer.xml | 143 ------ .../descriptors/PersonNameFinderTrainer.xml | 168 ------- opennlp-uima/descriptors/PosTaggerTrainer.xml | 116 ----- .../descriptors/SentenceDetectorTrainer.xml | 106 ----- opennlp-uima/descriptors/TokenizerTrainer.xml | 124 ----- .../opennlp/uima/chunker/ChunkerTrainer.java | 236 ---------- .../uima/doccat/DocumentCategorizerTrainer.java | 162 ------- .../uima/namefind/NameFinderTrainer.java | 447 ------------------- .../opennlp/uima/postag/POSTaggerTrainer.java | 240 ---------- .../sentdetect/SentenceDetectorTrainer.java | 210 --------- .../opennlp/uima/tokenize/TokenizerTrainer.java | 294 ------------ .../java/opennlp/uima/util/CasConsumerUtil.java | 411 ----------------- .../opennlp/uima/util/SampleTraceStream.java | 65 --- 13 files changed, 2722 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/f0020c40/opennlp-uima/descriptors/ChunkerTrainer.xml ---------------------------------------------------------------------- diff --git a/opennlp-uima/descriptors/ChunkerTrainer.xml b/opennlp-uima/descriptors/ChunkerTrainer.xml deleted file mode 100644 index fce9599..0000000 --- a/opennlp-uima/descriptors/ChunkerTrainer.xml +++ /dev/null @@ -1,143 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<casConsumerDescription xmlns="http://uima.apache.org/resourceSpecifier"> - <frameworkImplementation>org.apache.uima.java</frameworkImplementation> - <implementationName>opennlp.uima.chunker.ChunkerTrainer</implementationName> - <processingResourceMetaData> - <name>POS Trainer</name> - <description></description> - <version>${pom.version}</version> - <vendor>Apache Software Foundation</vendor> - <configurationParameters> - - <configurationParameter> - <name>opennlp.uima.ModelName</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.SentenceType</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.TokenType</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.POSFeature</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.Language</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.ChunkType</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.ChunkTagFeature</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - </configurationParameters> - - <configurationParameterSettings> - <nameValuePair> - <name>opennlp.uima.ModelName</name> - <value> - <string>POS.bin</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.TokenType</name> - <value> - <string>opennlp.uima.Token</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.SentenceType</name> - <value> - <string>opennlp.uima.Sentence</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.POSFeature</name> - <value> - <string>pos</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.Language</name> - <value> - <string>en</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.ChunkType</name> - <value> - <string>opennlp.uima.Chunk</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.ChunkTagFeature</name> - <value> - <string>chunkType</string> - </value> - </nameValuePair> - - </configurationParameterSettings> - - <typeSystemDescription /> - <typePriorities /> - <fsIndexCollection /> - <capabilities /> - <operationalProperties> - <modifiesCas>false</modifiesCas> - <multipleDeploymentAllowed>false</multipleDeploymentAllowed> - </operationalProperties> - </processingResourceMetaData> -</casConsumerDescription> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/f0020c40/opennlp-uima/descriptors/PersonNameFinderTrainer.xml ---------------------------------------------------------------------- diff --git a/opennlp-uima/descriptors/PersonNameFinderTrainer.xml b/opennlp-uima/descriptors/PersonNameFinderTrainer.xml deleted file mode 100644 index a7f1f8c..0000000 --- a/opennlp-uima/descriptors/PersonNameFinderTrainer.xml +++ /dev/null @@ -1,168 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<casConsumerDescription xmlns="http://uima.apache.org/resourceSpecifier"> - <frameworkImplementation>org.apache.uima.java - </frameworkImplementation> - <implementationName>opennlp.uima.namefind.NameFinderTrainer</implementationName> - <processingResourceMetaData> - <name>Person Name Finder Trainer</name> - <description></description> - <version>${pom.version}</version> - <vendor>Apache Software Foundation</vendor> - <configurationParameters> - <configurationParameter> - <name>opennlp.uima.ModelName</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.SentenceType</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.TokenType</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.NameType</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.TrainingParamsFile</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.AdditionalTrainingDataFile</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.AdditionalTrainingDataEncoding</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.SampleTraceFile</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.SampleTraceFileEncoding</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.FeatureGeneratorFile</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.FeatureGeneratorResources</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.Language</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - </configurationParameters> - - <configurationParameterSettings> - - <nameValuePair> - <name>opennlp.uima.ModelName</name> - <value> - <string>Person.bin</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.TokenType</name> - <value> - <string>opennlp.uima.Token</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.SentenceType</name> - <value> - <string>uima.tcas.DocumentAnnotation - </string> - </value> - </nameValuePair> - <nameValuePair> - <name>opennlp.uima.NameType</name> - <value> - <string>opennlp.uima.Person</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.Language</name> - <value> - <string>en</string> - </value> - </nameValuePair> - - </configurationParameterSettings> - <typeSystemDescription> - <imports> - <import location="TypeSystem.xml" /> - </imports> - </typeSystemDescription> - <typePriorities /> - <fsIndexCollection /> - <capabilities /> - <operationalProperties> - <modifiesCas>false</modifiesCas> - <multipleDeploymentAllowed>false</multipleDeploymentAllowed> - </operationalProperties> - </processingResourceMetaData> -</casConsumerDescription> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/f0020c40/opennlp-uima/descriptors/PosTaggerTrainer.xml ---------------------------------------------------------------------- diff --git a/opennlp-uima/descriptors/PosTaggerTrainer.xml b/opennlp-uima/descriptors/PosTaggerTrainer.xml deleted file mode 100644 index 325c76e..0000000 --- a/opennlp-uima/descriptors/PosTaggerTrainer.xml +++ /dev/null @@ -1,116 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<casConsumerDescription xmlns="http://uima.apache.org/resourceSpecifier"> - <frameworkImplementation>org.apache.uima.java</frameworkImplementation> - <implementationName>opennlp.uima.postag.POSTaggerTrainer</implementationName> - <processingResourceMetaData> - <name>POS Trainer</name> - <description></description> - <version>${pom.version}</version> - <vendor>Apache Software Foundation</vendor> - <configurationParameters> - - <configurationParameter> - <name>opennlp.uima.ModelName</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.SentenceType</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.TokenType</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.POSFeature</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.Language</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - </configurationParameters> - - <configurationParameterSettings> - <nameValuePair> - <name>opennlp.uima.ModelName</name> - <value> - <string>POS.bin</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.TokenType</name> - <value> - <string>opennlp.uima.Token</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.SentenceType</name> - <value> - <string>opennlp.uima.Sentence</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.POSFeature</name> - <value> - <string>pos</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.Language</name> - <value> - <string>en</string> - </value> - </nameValuePair> - - </configurationParameterSettings> - - <typeSystemDescription /> - <typePriorities /> - <fsIndexCollection /> - <capabilities /> - <operationalProperties> - <modifiesCas>false</modifiesCas> - <multipleDeploymentAllowed>false</multipleDeploymentAllowed> - </operationalProperties> - </processingResourceMetaData> -</casConsumerDescription> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/f0020c40/opennlp-uima/descriptors/SentenceDetectorTrainer.xml ---------------------------------------------------------------------- diff --git a/opennlp-uima/descriptors/SentenceDetectorTrainer.xml b/opennlp-uima/descriptors/SentenceDetectorTrainer.xml deleted file mode 100644 index 1db008f..0000000 --- a/opennlp-uima/descriptors/SentenceDetectorTrainer.xml +++ /dev/null @@ -1,106 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<casConsumerDescription xmlns="http://uima.apache.org/resourceSpecifier"> - <frameworkImplementation>org.apache.uima.java - </frameworkImplementation> - <implementationName>opennlp.uima.sentdetect.SentenceDetectorTrainer</implementationName> - <processingResourceMetaData> - <name>Sentence Detector Trainer</name> - <description></description> - <version>${pom.version}</version> - <vendor>Apache Software Foundation</vendor> - - <configurationParameters> - <configurationParameter> - <name>opennlp.uima.ModelName</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.SentenceType</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.Language</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - - <configurationParameter> - <name>opennlp.uima.EOSChars</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - <configurationParameter> - <name>opennlp.uima.SampleTraceFile</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - <configurationParameter> - <name>opennlp.uima.SampleTraceFileEncoding</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - </configurationParameters> - - <configurationParameterSettings> - <nameValuePair> - <name>opennlp.uima.ModelName</name> - <value> - <string>SentDetect.bin</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.SentenceType</name> - <value> - <string>opennlp.uima.Sentence</string> - </value> - </nameValuePair> - - <nameValuePair> - <name>opennlp.uima.Language</name> - <value> - <string>en</string> - </value> - </nameValuePair> - - </configurationParameterSettings> - <typeSystemDescription /> - <typePriorities /> - <fsIndexCollection /> - <capabilities /> - <operationalProperties> - <modifiesCas>false</modifiesCas> - <multipleDeploymentAllowed>false</multipleDeploymentAllowed> - </operationalProperties> - </processingResourceMetaData> -</casConsumerDescription> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/f0020c40/opennlp-uima/descriptors/TokenizerTrainer.xml ---------------------------------------------------------------------- diff --git a/opennlp-uima/descriptors/TokenizerTrainer.xml b/opennlp-uima/descriptors/TokenizerTrainer.xml deleted file mode 100644 index 654f3df..0000000 --- a/opennlp-uima/descriptors/TokenizerTrainer.xml +++ /dev/null @@ -1,124 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<casConsumerDescription xmlns="http://uima.apache.org/resourceSpecifier"> - <frameworkImplementation>org.apache.uima.java - </frameworkImplementation> - <implementationName>opennlp.uima.tokenize.TokenizerTrainer - </implementationName> - <processingResourceMetaData> - <name>TokenizerTrainer</name> - <description></description> - <version>${pom.version}</version> - <vendor>Apache Software Foundation</vendor> - <configurationParameters> - <configurationParameter> - <name>opennlp.uima.ModelName</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - <configurationParameter> - <name>opennlp.uima.SentenceType</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - <configurationParameter> - <name>opennlp.uima.TokenType</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - <configurationParameter> - <name>opennlp.uima.tokenizer.IsSkipAlphaNumerics - </name> - <type>Boolean</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - <configurationParameter> - <name>opennlp.uima.Language</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>true</mandatory> - </configurationParameter> - <configurationParameter> - <name>opennlp.uima.SampleTraceFile</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - <configurationParameter> - <name>opennlp.uima.SampleTraceFileEncoding</name> - <type>String</type> - <multiValued>false</multiValued> - <mandatory>false</mandatory> - </configurationParameter> - </configurationParameters> - <configurationParameterSettings> - <nameValuePair> - <name>opennlp.uima.ModelName</name> - <value> - <string>Tokens.bin</string> - </value> - </nameValuePair> - <nameValuePair> - <name>opennlp.uima.TokenType</name> - <value> - <string>opennlp.uima.Token</string> - </value> - </nameValuePair> - <nameValuePair> - <name>opennlp.uima.SentenceType</name> - <value> - <string>uima.tcas.DocumentAnnotation - </string> - </value> - </nameValuePair> - <nameValuePair> - <name>opennlp.uima.tokenizer.IsSkipAlphaNumerics - </name> - <value> - <boolean>false</boolean> - </value> - </nameValuePair> - <nameValuePair> - <name>opennlp.uima.Language</name> - <value> - <string>en</string> - </value> - </nameValuePair> - </configurationParameterSettings> - <typeSystemDescription> - <imports> - <import location="TypeSystem.xml" /> - </imports> - </typeSystemDescription> - <typePriorities /> - <fsIndexCollection /> - <capabilities /> - <operationalProperties> - <modifiesCas>false</modifiesCas> - <multipleDeploymentAllowed>false</multipleDeploymentAllowed> - </operationalProperties> - </processingResourceMetaData> -</casConsumerDescription> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/f0020c40/opennlp-uima/src/main/java/opennlp/uima/chunker/ChunkerTrainer.java ---------------------------------------------------------------------- diff --git a/opennlp-uima/src/main/java/opennlp/uima/chunker/ChunkerTrainer.java b/opennlp-uima/src/main/java/opennlp/uima/chunker/ChunkerTrainer.java deleted file mode 100644 index 30b3f2f..0000000 --- a/opennlp-uima/src/main/java/opennlp/uima/chunker/ChunkerTrainer.java +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.uima.chunker; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.FSIndex; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CasConsumer_ImplBase; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceProcessException; -import org.apache.uima.util.Level; -import org.apache.uima.util.Logger; -import org.apache.uima.util.ProcessTrace; - -import opennlp.tools.chunker.ChunkSample; -import opennlp.tools.chunker.ChunkerFactory; -import opennlp.tools.chunker.ChunkerME; -import opennlp.tools.chunker.ChunkerModel; -import opennlp.tools.ml.maxent.GIS; -import opennlp.tools.util.ObjectStreamUtils; -import opennlp.tools.util.model.ModelUtil; -import opennlp.uima.util.CasConsumerUtil; -import opennlp.uima.util.ContainingConstraint; -import opennlp.uima.util.OpennlpUtil; -import opennlp.uima.util.UimaUtil; - -/** - * OpenNLP Chunker trainer. - * <p> - * Mandatory parameters - * <table border=1> - * <caption></caption> - * <tr><th>Type</th> <th>Name</th> <th>Description</th></tr> - * <tr><td>String</td> <td>opennlp.uima.ModelName</td> <td>The name of the model file</td></tr> - * <tr><td>String</td> <td>opennlp.uima.SentenceType</td> <td>The full name of the sentence type</td></tr> - * <tr><td>String</td> <td>opennlp.uima.TokenType</td> <td>The full name of the token type</td></tr> - * <tr><td>String</td> <td>opennlp.uima.POSFeature</td></tr> - * <tr><td>String</td> <td>opennlp.uima.ChunkType</td></tr> - * <tr><td>String</td> <td>opennlp.uima.ChunkTagFeature</td></tr> - * </table> - * - * @deprecated will be removed after 1.7.1 release, there is no replacement - */ -@Deprecated -public class ChunkerTrainer extends CasConsumer_ImplBase { - - private List<ChunkSample> mChunkSamples = new ArrayList<>(); - - private UimaContext mContext; - - private String mModelName; - - private Type mSentenceType; - - private Type mTokenType; - - private Feature mPOSFeature; - - private Type mChunkType; - - private Feature mChunkTagFeature; - - private String language; - - /** - * Initializes the current instance. - */ - public void initialize() throws ResourceInitializationException { - - super.initialize(); - - mContext = getUimaContext(); - - Logger mLogger = mContext.getLogger(); - - if (mLogger.isLoggable(Level.INFO)) { - mLogger.log(Level.INFO, "Initializing the OpenNLP Chunker Trainer."); - } - - mModelName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.MODEL_PARAMETER); - - language = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.LANGUAGE_PARAMETER); - } - - /** - * Initialize the current instance with the given type system. - */ - public void typeSystemInit(TypeSystem typeSystem) - throws ResourceInitializationException { - String sentenceTypeName = - CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.SENTENCE_TYPE_PARAMETER); - - mSentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName); - - String chunkTypeName = CasConsumerUtil.getRequiredStringParameter(mContext, - Chunker.CHUNK_TYPE_PARAMETER); - - mChunkType = CasConsumerUtil.getType(typeSystem, chunkTypeName); - - String chunkTagFeature = CasConsumerUtil.getRequiredStringParameter( - mContext, Chunker.CHUNK_TAG_FEATURE_PARAMETER); - - mChunkTagFeature = mChunkType.getFeatureByBaseName(chunkTagFeature); - - CasConsumerUtil.checkFeatureType(mChunkTagFeature, CAS.TYPE_NAME_STRING); - - String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.TOKEN_TYPE_PARAMETER); - - mTokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName); - - String posFeatureName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.POS_FEATURE_PARAMETER); - - mPOSFeature = mTokenType.getFeatureByBaseName(posFeatureName); - - CasConsumerUtil.checkFeatureType(mPOSFeature, CAS.TYPE_NAME_STRING); - } - - /** - * Process the given CAS object. - */ - public void processCas(CAS cas) { - - FSIndex<AnnotationFS> sentenceIndex = cas.getAnnotationIndex(mSentenceType); - - for (AnnotationFS sentenceAnnotation : sentenceIndex) { - processSentence(cas, sentenceAnnotation); - } - } - - private void processSentence(CAS tcas, AnnotationFS sentence) { - FSIndex<AnnotationFS> chunkIndex = tcas.getAnnotationIndex(mChunkType); - - ContainingConstraint containingConstraint = - new ContainingConstraint(sentence); - - Iterator<AnnotationFS> chunkIterator = tcas.createFilteredIterator( - chunkIndex.iterator(), containingConstraint); - - while (chunkIterator.hasNext()) { - AnnotationFS chunkAnnotation = chunkIterator.next(); - processChunk(tcas, chunkAnnotation); - } - } - - private void processChunk(CAS tcas, AnnotationFS chunk) { - - String chunkTag = chunk.getFeatureValueAsString(mChunkTagFeature); - - FSIndex<AnnotationFS> tokenIndex = tcas.getAnnotationIndex(mTokenType); - - ContainingConstraint containingConstraint = - new ContainingConstraint(chunk); - - Iterator<AnnotationFS> tokenIterator = tcas.createFilteredIterator(tokenIndex.iterator(), - containingConstraint); - - List<String> tokens = new ArrayList<>(); - List<String> tags = new ArrayList<>(); - List<String> chunkTags = new ArrayList<>(); - - while (tokenIterator.hasNext()) { - AnnotationFS tokenAnnotation = tokenIterator.next(); - - tokens.add(tokenAnnotation.getCoveredText().trim()); - tags.add(tokenAnnotation.getFeatureValueAsString(mPOSFeature)); - chunkTags.add(chunkTag); - } - - mChunkSamples.add(new ChunkSample(tokens, tags, chunkTags)); - } - - /** - * Called if the processing is finished, this method - * does the training. - */ - public void collectionProcessComplete(ProcessTrace trace) - throws ResourceProcessException, IOException { - GIS.PRINT_MESSAGES = false; - - ChunkerModel chunkerModel = ChunkerME.train(language, - ObjectStreamUtils.createObjectStream(mChunkSamples), - ModelUtil.createDefaultTrainingParameters(), ChunkerFactory.create(null)); - - // dereference to allow garbage collection - mChunkSamples = null; - - File modelFile = new File(getUimaContextAdmin().getResourceManager() - .getDataPath() + File.separatorChar + mModelName); - - OpennlpUtil.serialize(chunkerModel, modelFile); - } - - /** - * The trainer is not stateless. - */ - public boolean isStateless() { - return false; - } - - /** - * Releases allocated resources. - */ - public void destroy() { - mChunkSamples = null; - } -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/f0020c40/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizerTrainer.java ---------------------------------------------------------------------- diff --git a/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizerTrainer.java b/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizerTrainer.java deleted file mode 100644 index ca788d7..0000000 --- a/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizerTrainer.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.uima.doccat; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.FSIndex; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CasConsumer_ImplBase; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceProcessException; -import org.apache.uima.util.Level; -import org.apache.uima.util.Logger; -import org.apache.uima.util.ProcessTrace; - -import opennlp.tools.doccat.DoccatFactory; -import opennlp.tools.doccat.DoccatModel; -import opennlp.tools.doccat.DocumentCategorizerME; -import opennlp.tools.doccat.DocumentSample; -import opennlp.tools.ml.maxent.GIS; -import opennlp.tools.util.ObjectStreamUtils; -import opennlp.tools.util.TrainingParameters; -import opennlp.uima.util.CasConsumerUtil; -import opennlp.uima.util.OpennlpUtil; -import opennlp.uima.util.UimaUtil; - -/** - * OpenNLP NameFinder trainer. - * <p> - * Note: This class is still work in progress, and should not be used! - * - * @deprecated will be removed after 1.7.1 release, there is no replacement - */ -@Deprecated - -public class DocumentCategorizerTrainer extends CasConsumer_ImplBase { - - private UimaContext mContext; - - private String mModelName; - - private List<DocumentSample> documentSamples = new ArrayList<>(); - - private Type mCategoryType; - - private Feature mCategoryFeature; - - private String language; - - public void initialize() throws ResourceInitializationException { - - super.initialize(); - - mContext = getUimaContext(); - - Logger mLogger = mContext.getLogger(); - - if (mLogger.isLoggable(Level.INFO)) { - mLogger.log(Level.INFO, "Initializing the OpenNLP Doccat Trainer."); - } - - mModelName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.MODEL_PARAMETER); - - language = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.LANGUAGE_PARAMETER); - } - - public void typeSystemInit(TypeSystem typeSystem) - throws ResourceInitializationException { - - String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.SENTENCE_TYPE_PARAMETER); - - Type mTokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName); - - String categoryTypeName = CasConsumerUtil.getRequiredStringParameter(mContext, - "opennlp.uima.doccat.CategoryType"); - - mCategoryType = CasConsumerUtil.getType(typeSystem, categoryTypeName); - - // get feature name - String categoryFeatureName = CasConsumerUtil.getRequiredStringParameter(mContext, - "opennlp.uima.doccat.CategoryFeature"); - - mCategoryFeature = mCategoryType.getFeatureByBaseName(categoryFeatureName); - } - - public void processCas(CAS cas) throws ResourceProcessException { - - FSIndex categoryIndex = cas.getAnnotationIndex(mCategoryType); - - if (categoryIndex.size() > 0) { - AnnotationFS categoryAnnotation = - (AnnotationFS) categoryIndex.iterator().next(); - - // add to event collection - - DocumentSample sample = new DocumentSample( - categoryAnnotation.getStringValue(mCategoryFeature), - cas.getDocumentText()); - - documentSamples.add(sample); - } - } - - public void collectionProcessComplete(ProcessTrace trace) - throws ResourceProcessException, IOException { - - GIS.PRINT_MESSAGES = false; - - TrainingParameters params = new TrainingParameters(); - params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100)); - params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0)); - - DoccatModel categoryModel = DocumentCategorizerME.train(language, - ObjectStreamUtils.createObjectStream(documentSamples), params, new DoccatFactory()); - - File modelFile = new File(getUimaContextAdmin().getResourceManager() - .getDataPath() + File.separatorChar + mModelName); - - OpennlpUtil.serialize(categoryModel, modelFile); - } - - /** - * The trainer is not stateless. - */ - public boolean isStateless() { - return false; - } - - /** - * Destroys the current instance. - */ - public void destroy() { - // dereference to allow garbage collection - documentSamples = null; - } -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/f0020c40/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java ---------------------------------------------------------------------- diff --git a/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java b/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java deleted file mode 100644 index af00f58..0000000 --- a/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java +++ /dev/null @@ -1,447 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.uima.namefind; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; - -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.FSIndex; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CasConsumer_ImplBase; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceProcessException; -import org.apache.uima.util.Level; -import org.apache.uima.util.Logger; -import org.apache.uima.util.ProcessTrace; - -import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool; -import opennlp.tools.ml.maxent.GIS; -import opennlp.tools.namefind.BioCodec; -import opennlp.tools.namefind.NameFinderME; -import opennlp.tools.namefind.NameSample; -import opennlp.tools.namefind.NameSampleDataStream; -import opennlp.tools.namefind.TokenNameFinderFactory; -import opennlp.tools.namefind.TokenNameFinderModel; -import opennlp.tools.util.InputStreamFactory; -import opennlp.tools.util.MarkableFileInputStreamFactory; -import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.ObjectStreamUtils; -import opennlp.tools.util.PlainTextByLineStream; -import opennlp.tools.util.Span; -import opennlp.tools.util.TrainingParameters; -import opennlp.uima.util.CasConsumerUtil; -import opennlp.uima.util.ContainingConstraint; -import opennlp.uima.util.OpennlpUtil; -import opennlp.uima.util.SampleTraceStream; -import opennlp.uima.util.UimaUtil; - -/** - * OpenNLP NameFinder trainer. - * <p> - * Mandatory parameters - * <table border=1> - * <caption></caption> - * <tr><th>Type</th> <th>Name</th> <th>Description</th></tr> - * <tr><td>String</td> <td>opennlp.uima.ModelName</td> <td>The name of the model file</td></tr> - * <tr><td>String</td> <td>opennlp.uima.Language</td> <td>The language code</td></tr> - * <tr><td>String</td> <td>opennlp.uima.SentenceType</td> <td>The full name of the sentence type</td></tr> - * <tr><td>String</td> <td>opennlp.uima.TokenType</td> <td>The full name of the token type</td></tr> - * <tr><td>String</td> <td>opennlp.uima.NameType</td> <td>The full name of the name type</td></tr> - * </table> - * - * Optional parameters - * <table border=1> - * <caption></caption> - * <tr><th>Type</th> <th>Name</th> <th>Description</th></tr> - * <tr><td>String</td> <td>opennlp.uima.opennlp.uima.TrainingParamsFile</td> - * <td>Training Parameters Properties file</td></tr> - * <tr><td>String</td> <td>opennlp.uima.FeatureGeneratorFile</td> - * <td>Feature Generator definition file which contain the feature generator configuration</td></tr> - * <tr><td>String</td> <td>opennlp.uima.FeatureGeneratorResources</td> - * <td>Feature Generator resources dictionary</td></tr> - * <tr><td>String</td> <td>opennlp.uima.AdditionalTrainingDataFile</td> - * <td>Training file which contains additional data in the OpenNLP format</td></tr> - * <tr><td>String</td> <td>opennlp.uima.AdditionalTrainingDataEncoding</td> - * <td>Encoding of the additional training data</td></tr> - * <tr><td>String</td> <td>opennlp.uima.SampleTraceFile</td> - * <td>All training samples are traced to this file</td></tr> - * <tr><td>String</td> <td>opennlp.uima.SampleTraceFileEncoding</td> - * <td>Encoding of the sample trace file</td></tr> - * </table> - * <p> - * - * @deprecated will be removed after 1.7.1 release, there is no replacement - */ -@Deprecated - -public final class NameFinderTrainer extends CasConsumer_ImplBase { - - private static final String FEATURE_GENERATOR_DEFINITION_FILE_PARAMETER = - "opennlp.uima.FeatureGeneratorFile"; - private static final String FEATURE_GENERATOR_RESOURCES_PARAMETER = - "opennlp.uima.FeatureGeneratorResources"; - - private Logger logger; - - private String modelPath; - - private byte featureGeneratorDefinition[]; - - private File featureGeneratorResourceDir; - - private String additionalTrainingDataFile; - - private String additionalTrainingDataEncoding; - - private File sampleTraceFile = null; - - private String sampleTraceFileEncoding = null; - - private Type sentenceType; - - private Type tokenType; - - private Type nameType; - - private String language; - - // TODO: Keeping all events in memory limits the size of the training corpus - // Possible solutions: - // - Write all events to disk - // - Directly start indexing with a blocking sample stream, the indexer will then write everything - // to disk or could store the events much more space efficient in memory - - private List<NameSample> nameFinderSamples = new ArrayList<>(); - private TrainingParameters trainingParams; - - /** - * Initializes the current instance. - */ - public void initialize() throws ResourceInitializationException { - - super.initialize(); - - logger = getUimaContext().getLogger(); - - if (logger.isLoggable(Level.INFO)) { - logger.log(Level.INFO, "Initializing the OpenNLP Name Trainer."); - } - - modelPath = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), - UimaUtil.MODEL_PARAMETER); - - language = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), - UimaUtil.LANGUAGE_PARAMETER); - - trainingParams = OpennlpUtil.loadTrainingParams(CasConsumerUtil.getOptionalStringParameter( - getUimaContext(), UimaUtil.TRAINING_PARAMS_FILE_PARAMETER), true); - - String featureGeneratorDefinitionFile = CasConsumerUtil.getOptionalStringParameter( - getUimaContext(), FEATURE_GENERATOR_DEFINITION_FILE_PARAMETER); - - if (featureGeneratorDefinitionFile != null) { - try { - featureGeneratorDefinition = OpennlpUtil.loadBytes(new File(featureGeneratorDefinitionFile)); - } catch (IOException e) { - throw new ResourceInitializationException(e); - } - - String featureGeneratorResourcesDirName = CasConsumerUtil.getOptionalStringParameter( - getUimaContext(), FEATURE_GENERATOR_RESOURCES_PARAMETER); - - if (featureGeneratorResourcesDirName != null) { - featureGeneratorResourceDir = new File(featureGeneratorResourcesDirName); - } - } - - additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter( - getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_FILE); - - // If the additional training data is specified, the encoding must be provided! - if (additionalTrainingDataFile != null) { - additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter( - getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_ENCODING); - } - - String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter( - getUimaContext(), "opennlp.uima.SampleTraceFile"); - - if (sampleTraceFileName != null) { - sampleTraceFile = new File(getUimaContextAdmin().getResourceManager() - .getDataPath() + File.separatorChar + sampleTraceFileName); - sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter( - getUimaContext(), "opennlp.uima.SampleTraceFileEncoding"); - } - } - - /** - * Initialize the current instance with the given type system. - */ - public void typeSystemInit(TypeSystem typeSystem) - throws ResourceInitializationException { - - String sentenceTypeName = - CasConsumerUtil.getRequiredStringParameter(getUimaContext(), - UimaUtil.SENTENCE_TYPE_PARAMETER); - - sentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName); - - String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), - UimaUtil.TOKEN_TYPE_PARAMETER); - - tokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName); - - String nameTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), - NameFinder.NAME_TYPE_PARAMETER); - - nameType = CasConsumerUtil.getType(typeSystem, nameTypeName); - } - - /** - * Creates a {@link List} from an {@link Iterator}. - * - * @param <T> - * @param it - * @return - */ - private static <T> List<T> iteratorToList(Iterator<T> it) { - List<T> list = new LinkedList<>(); - - while (it.hasNext()) { - list.add(it.next()); - } - - return list; - } - - private static boolean isContaining(AnnotationFS annotation, - AnnotationFS containtedAnnotation) { - boolean isStartContaining = annotation.getBegin() <= containtedAnnotation.getBegin(); - return isStartContaining && annotation.getEnd() >= containtedAnnotation.getEnd(); - - } - - /** - * Creates the name spans out of a list of token annotations and a list of entity annotations. - * <p> - * The name spans for the name finder use a token index and not on a character index which - * is used by the entity annotations. - * - * @param tokenList - * @param entityAnnotations - * @return - */ - private static Span[] createNames(List<AnnotationFS> tokenList, List<AnnotationFS> entityAnnotations) { - - List<Span> nameList = new LinkedList<>(); - - AnnotationFS currentEntity = null; - - int startIndex = -1; - int index = 0; - for (AnnotationFS token : tokenList) { - for (AnnotationFS entity : entityAnnotations) { - - if (!isContaining(entity, token)) { - // ... end of an entity - if (currentEntity == entity) { - nameList.add(new Span(startIndex, index)); - - startIndex = -1; - currentEntity = null; - // break; - } else { - continue; - } - } - - // is this token start of new entity - if (currentEntity == null && isContaining(entity, token)) { - startIndex = index; - - currentEntity = entity; - } - } - - index++; - } - - if (currentEntity != null) { - Span name = new Span(startIndex, index); - nameList.add(name); - } - - return nameList.toArray(new Span[nameList.size()]); - } - - /* - * Process the given CAS object. - */ - /** - * Process the given CAS object. - */ - public void processCas(CAS cas) { - FSIndex<AnnotationFS> sentenceIndex = cas.getAnnotationIndex(sentenceType); - - boolean isClearAdaptiveData = true; - - for (AnnotationFS sentenceAnnotation : sentenceIndex) { - ContainingConstraint sentenceContainingConstraint = new ContainingConstraint( - sentenceAnnotation); - - FSIndex<AnnotationFS> tokenAnnotations = cas.getAnnotationIndex(tokenType); - - Iterator<AnnotationFS> containingTokens = cas.createFilteredIterator(tokenAnnotations - .iterator(), sentenceContainingConstraint); - - FSIndex<AnnotationFS> allNames = cas.getAnnotationIndex(nameType); - - Iterator<AnnotationFS> containingNames = cas.createFilteredIterator(allNames.iterator(), - sentenceContainingConstraint); - - List<AnnotationFS> tokenList = iteratorToList(containingTokens); - - Span names[] = createNames(tokenList, iteratorToList(containingNames)); - - // create token array - String tokenArray[] = new String[tokenList.size()]; - - for (int i = 0; i < tokenArray.length; i++) { - tokenArray[i] = tokenList.get(i).getCoveredText(); - } - - NameSample trainingSentence = new NameSample(tokenArray, names, null, isClearAdaptiveData); - - if (trainingSentence.getSentence().length != 0) { - nameFinderSamples.add(trainingSentence); - - if (isClearAdaptiveData) { - isClearAdaptiveData = false; - } - } else { - if (logger.isLoggable(Level.INFO)) { - logger.log(Level.INFO, "Sentence without tokens: " + - sentenceAnnotation.getCoveredText()); - } - } - } - } - - /** - * Called if the processing is finished, this method - * does the training. - */ - public void collectionProcessComplete(ProcessTrace trace) - throws ResourceProcessException, IOException { - - if (logger.isLoggable(Level.INFO)) { - logger.log(Level.INFO, "Collected " + nameFinderSamples.size() + - " name samples."); - } - - GIS.PRINT_MESSAGES = false; - - // create training stream ... - ObjectStream<NameSample> samples = ObjectStreamUtils.createObjectStream(nameFinderSamples); - - Writer samplesOut = null; - TokenNameFinderModel nameModel; - try { - if (additionalTrainingDataFile != null) { - - if (logger.isLoggable(Level.INFO)) { - logger.log(Level.INFO, "Using additional training data file: " + additionalTrainingDataFile); - } - - InputStreamFactory additionalTrainingDataIn = new MarkableFileInputStreamFactory( - new File(additionalTrainingDataFile)); - Charset additionalTrainingDataCharset = Charset - .forName(additionalTrainingDataEncoding); - - ObjectStream<NameSample> additionalSamples = new NameSampleDataStream( - new PlainTextByLineStream(additionalTrainingDataIn, - additionalTrainingDataCharset)); - - samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples); - } - - if (sampleTraceFile != null) { - samplesOut = new OutputStreamWriter(new FileOutputStream(sampleTraceFile), sampleTraceFileEncoding); - samples = new SampleTraceStream<>(samples, samplesOut); - } - - Map<String, Object> resourceMap; - - if (featureGeneratorResourceDir != null) { - resourceMap = TokenNameFinderTrainerTool.loadResources(featureGeneratorResourceDir, null); - } - else { - resourceMap = Collections.emptyMap(); - } - - nameModel = NameFinderME.train(language, null, samples, trainingParams, - new TokenNameFinderFactory(featureGeneratorDefinition, resourceMap, new BioCodec())); - } - finally { - - if (samplesOut != null) { - samplesOut.close(); - } - } - - // dereference to allow garbage collection - nameFinderSamples = null; - - File modelFile = new File(getUimaContextAdmin().getResourceManager() - .getDataPath() + File.separatorChar + modelPath); - - OpennlpUtil.serialize(nameModel, modelFile); - - if (logger.isLoggable(Level.INFO)) { - logger.log(Level.INFO, "Model was written to: " + modelFile.getAbsolutePath()); - } - } - - /** - * The trainer is not stateless. - */ - public boolean isStateless() { - return false; - } - - /** - * Destroys the current instance. - */ - public void destroy() { - // dereference to allow garbage collection - nameFinderSamples = null; - } -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/f0020c40/opennlp-uima/src/main/java/opennlp/uima/postag/POSTaggerTrainer.java ---------------------------------------------------------------------- diff --git a/opennlp-uima/src/main/java/opennlp/uima/postag/POSTaggerTrainer.java b/opennlp-uima/src/main/java/opennlp/uima/postag/POSTaggerTrainer.java deleted file mode 100644 index be7651e..0000000 --- a/opennlp-uima/src/main/java/opennlp/uima/postag/POSTaggerTrainer.java +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.uima.postag; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.FSIndex; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CasConsumer_ImplBase; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceProcessException; -import org.apache.uima.util.Level; -import org.apache.uima.util.Logger; -import org.apache.uima.util.ProcessTrace; - -import opennlp.tools.ml.maxent.GIS; -import opennlp.tools.postag.POSDictionary; -import opennlp.tools.postag.POSModel; -import opennlp.tools.postag.POSSample; -import opennlp.tools.postag.POSTaggerFactory; -import opennlp.tools.postag.POSTaggerME; -import opennlp.tools.util.ObjectStreamUtils; -import opennlp.tools.util.TrainingParameters; -import opennlp.uima.util.AnnotatorUtil; -import opennlp.uima.util.CasConsumerUtil; -import opennlp.uima.util.ContainingConstraint; -import opennlp.uima.util.OpennlpUtil; -import opennlp.uima.util.UimaUtil; - -/** - * OpenNLP POSTagger trainer. - * <p> - * Mandatory parameters - * <table border=1> - * <caption></caption> - * <tr><th>Type</th> <th>Name</th> <th>Description</th></tr> - * <tr><td>String</td> <td>opennlp.uima.ModelName</td> <td>The name of the model file</td></tr> - * <tr><td>String</td> <td>opennlp.uima.SentenceType</td> <td>The full name of the sentence type</td></tr> - * <tr><td>String</td> <td>opennlp.uima.TokenType</td> <td>The full name of the token type</td></tr> - * <tr><td>String</td> <td>pennlp.uima.POSFeature</td> <td>The name of the token pos feature, - * the feature must be of type String</td></tr> - * <tr><td>String</td> <td>opennlp.uima.TagDictionaryName</td></tr> - * </table> - * - * @deprecated will be removed after 1.7.1 release, there is no replacement - */ -@Deprecated - -public class POSTaggerTrainer extends CasConsumer_ImplBase { - - public static final String TAG_DICTIONARY_NAME = "opennlp.uima.TagDictionaryName"; - - private UimaContext mContext; - - private Type mSentenceType; - - private Type mTokenType; - - private String mModelName; - - private Feature mPOSFeature; - - private Logger mLogger; - - private List<POSSample> mPOSSamples = new ArrayList<>(); - - private String language; - - private POSDictionary tagDictionary; - - /** - * Initializes the current instance. - */ - public void initialize() throws ResourceInitializationException { - - super.initialize(); - - mContext = getUimaContext(); - - mLogger = mContext.getLogger(); - - if (mLogger.isLoggable(Level.INFO)) { - mLogger.log(Level.INFO, "Initializing the OpenNLP " + - "POSTagger trainer."); - } - - mModelName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.MODEL_PARAMETER); - - language = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.LANGUAGE_PARAMETER); - - String tagDictionaryName = CasConsumerUtil.getOptionalStringParameter(mContext, - TAG_DICTIONARY_NAME); - - if (tagDictionaryName != null) { - try (InputStream dictIn = AnnotatorUtil.getResourceAsStream(mContext, tagDictionaryName)) { - tagDictionary = POSDictionary.create(dictIn); - } catch (final IOException e) { - // if this fails just print error message and continue - final String message = "IOException during tag dictionary reading, " - + "running without tag dictionary: " + e.getMessage(); - - if (this.mLogger.isLoggable(Level.WARNING)) { - this.mLogger.log(Level.WARNING, message); - } - } - } - } - - /** - * Initialize the current instance with the given type system. - */ - public void typeSystemInit(TypeSystem typeSystem) - throws ResourceInitializationException { - String sentenceTypeName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.SENTENCE_TYPE_PARAMETER); - - if (mLogger.isLoggable(Level.INFO)) { - mLogger.log(Level.INFO, UimaUtil.SENTENCE_TYPE_PARAMETER + ": " + - sentenceTypeName); - } - - mSentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName); - - String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.TOKEN_TYPE_PARAMETER); - - mTokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName); - - String posFeatureName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.POS_FEATURE_PARAMETER); - - mPOSFeature = mTokenType.getFeatureByBaseName(posFeatureName); - } - - /** - * Process the given CAS object. - */ - public void processCas(CAS cas) { - - FSIndex<AnnotationFS> sentenceAnnotations = cas.getAnnotationIndex(mSentenceType); - - for (AnnotationFS sentence : sentenceAnnotations) { - process(cas, sentence); - } - } - - private void process(CAS tcas, AnnotationFS sentence) { - - FSIndex<AnnotationFS> allTokens = tcas.getAnnotationIndex(mTokenType); - - ContainingConstraint containingConstraint = - new ContainingConstraint(sentence); - - List<String> tokens = new ArrayList<>(); - List<String> tags = new ArrayList<>(); - - Iterator<AnnotationFS> containingTokens = tcas.createFilteredIterator( - allTokens.iterator(), containingConstraint); - - while (containingTokens.hasNext()) { - - AnnotationFS tokenAnnotation = containingTokens.next(); - - String tag = tokenAnnotation.getFeatureValueAsString(mPOSFeature); - - tokens.add(tokenAnnotation.getCoveredText().trim()); - tags.add(tag); - } - - mPOSSamples.add(new POSSample(tokens, tags)); - } - - /** - * Called if the processing is finished, this method - * does the training. - */ - public void collectionProcessComplete(ProcessTrace trace) - throws ResourceProcessException, IOException { - - GIS.PRINT_MESSAGES = false; - - TrainingParameters params = new TrainingParameters(); - params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100)); - params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5)); - - POSModel posTaggerModel = POSTaggerME.train(language, - ObjectStreamUtils.createObjectStream(mPOSSamples), - params, new POSTaggerFactory(null, tagDictionary)); - - // dereference to allow garbage collection - mPOSSamples = null; - - File modelFile = new File(getUimaContextAdmin().getResourceManager() - .getDataPath() + File.separatorChar + mModelName); - - OpennlpUtil.serialize(posTaggerModel, modelFile); - } - - /** - * The trainer is not stateless. - */ - public boolean isStateless() { - return false; - } - - /** - * Releases allocated resources. - */ - public void destroy() { - // dereference to allow garbage collection - mPOSSamples = null; - } -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/f0020c40/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java ---------------------------------------------------------------------- diff --git a/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java b/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java deleted file mode 100644 index 99600b8..0000000 --- a/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.uima.sentdetect; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.FSIndex; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CasConsumer_ImplBase; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceProcessException; -import org.apache.uima.util.Level; -import org.apache.uima.util.Logger; -import org.apache.uima.util.ProcessTrace; - -import opennlp.tools.ml.maxent.GIS; -import opennlp.tools.sentdetect.SentenceDetectorFactory; -import opennlp.tools.sentdetect.SentenceDetectorME; -import opennlp.tools.sentdetect.SentenceModel; -import opennlp.tools.sentdetect.SentenceSample; -import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.ObjectStreamUtils; -import opennlp.tools.util.Span; -import opennlp.tools.util.TrainingParameters; -import opennlp.tools.util.model.ModelUtil; -import opennlp.uima.util.CasConsumerUtil; -import opennlp.uima.util.OpennlpUtil; -import opennlp.uima.util.SampleTraceStream; -import opennlp.uima.util.UimaUtil; - -/** - * OpenNLP SentenceDetector trainer. - * <p> - * Mandatory parameters - * <table border=1> - * <caption></caption> - * <tr><th>Type</th> <th>Name</th> <th>Description</th></tr> - * <tr><td>String</td> <td>opennlp.uima.ModelName</td> <td>The name of the model file</td></tr> - * <tr><td>String</td> <td>opennlp.uima.SentenceType</td> <td>The full name of the sentence type</td></tr> - * <tr><td>String</td> <td>opennlp.uima.EOSChars</td> - * <td>A string containing end-of-sentence characters</td></tr> - * </table> - * - * @deprecated will be removed after 1.7.1 release, there is no replacement - */ -@Deprecated -public final class SentenceDetectorTrainer extends CasConsumer_ImplBase { - - private List<SentenceSample> sentenceSamples = new ArrayList<>(); - - private Type mSentenceType; - - private String mModelName; - - private String language = "en"; - - private UimaContext mContext; - - private String eosChars; - - private File sampleTraceFile; - - private String sampleTraceFileEncoding; - - /** - * Initializes the current instance. - */ - public void initialize() throws ResourceInitializationException { - - super.initialize(); - - mContext = getUimaContext(); - - Logger mLogger = mContext.getLogger(); - - if (mLogger.isLoggable(Level.INFO)) { - mLogger.log(Level.INFO, "Initializing the OpenNLP SentenceDetector " + - "trainer."); - } - - mModelName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.MODEL_PARAMETER); - - language = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.LANGUAGE_PARAMETER); - - eosChars = CasConsumerUtil.getOptionalStringParameter(mContext, "opennlp.uima.EOSChars"); - - - String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter( - getUimaContext(), "opennlp.uima.SampleTraceFile"); - - if (sampleTraceFileName != null) { - sampleTraceFile = new File(getUimaContextAdmin().getResourceManager() - .getDataPath() + File.separatorChar + sampleTraceFileName); - sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter( - getUimaContext(), "opennlp.uima.SampleTraceFileEncoding"); - } - } - - /** - * Initializes the current instance with the given type system. - */ - public void typeSystemInit(TypeSystem typeSystem) - throws ResourceInitializationException { - - String sentenceTypeName = - CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.SENTENCE_TYPE_PARAMETER); - - mSentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName); - } - - /** - * Process the given CAS object. - */ - public void processCas(CAS cas) { - - FSIndex<AnnotationFS> sentenceIndex = cas.getAnnotationIndex(mSentenceType); - - Span[] sentSpans = new Span[sentenceIndex.size()]; - - int i = 0; - for (AnnotationFS sentenceAnnotation : sentenceIndex) { - sentSpans[i++] = new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd()); - } - - // TODO: The line cleaning should be done more carefully - sentenceSamples.add(new SentenceSample(cas.getDocumentText().replace('\n', ' '), sentSpans)); - } - - /** - * Called if the processing is finished, this method - * does the training. - */ - public void collectionProcessComplete(ProcessTrace trace) - throws ResourceProcessException, IOException { - GIS.PRINT_MESSAGES = false; - - char eos[] = null; - if (eosChars != null) { - eos = eosChars.toCharArray(); - } - - SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create( - null, language, true, null, eos); - - // TrainingParameters mlParams = ModelUtil.createTrainingParameters(100, 5); - TrainingParameters mlParams = ModelUtil.createDefaultTrainingParameters(); - ObjectStream<SentenceSample> samples = ObjectStreamUtils.createObjectStream(sentenceSamples); - - Writer samplesOut; - - if (sampleTraceFile != null) { - samplesOut = new OutputStreamWriter(new FileOutputStream(sampleTraceFile), sampleTraceFileEncoding); - samples = new SampleTraceStream<>(samples, samplesOut); - } - - SentenceModel sentenceModel = SentenceDetectorME.train(language, samples, - sdFactory, mlParams); - - // dereference to allow garbage collection - sentenceSamples = null; - - File modelFile = new File(getUimaContextAdmin().getResourceManager() - .getDataPath() + File.separatorChar + mModelName); - - OpennlpUtil.serialize(sentenceModel, modelFile); - } - - /** - * The trainer is not stateless. - */ - public boolean isStateless() { - return false; - } - - /** - * Releases allocated resources. - */ - public void destroy() { - // dereference to allow garbage collection - sentenceSamples = null; - } -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/f0020c40/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java ---------------------------------------------------------------------- diff --git a/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java b/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java deleted file mode 100644 index 35f24a2..0000000 --- a/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.uima.tokenize; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.FSIndex; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CasConsumer_ImplBase; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceProcessException; -import org.apache.uima.util.Level; -import org.apache.uima.util.Logger; -import org.apache.uima.util.ProcessTrace; - -import opennlp.tools.ml.maxent.GIS; -import opennlp.tools.tokenize.TokenSample; -import opennlp.tools.tokenize.TokenSampleStream; -import opennlp.tools.tokenize.TokenizerFactory; -import opennlp.tools.tokenize.TokenizerME; -import opennlp.tools.tokenize.TokenizerModel; -import opennlp.tools.util.InputStreamFactory; -import opennlp.tools.util.MarkableFileInputStreamFactory; -import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.ObjectStreamUtils; -import opennlp.tools.util.PlainTextByLineStream; -import opennlp.tools.util.Span; -import opennlp.tools.util.model.ModelUtil; -import opennlp.uima.util.CasConsumerUtil; -import opennlp.uima.util.ContainingConstraint; -import opennlp.uima.util.OpennlpUtil; -import opennlp.uima.util.SampleTraceStream; -import opennlp.uima.util.UimaUtil; - -/** - * OpenNLP Tokenizer trainer. - * <p> - * Mandatory parameters - * <table border=1> - * <caption></caption> - * <tr><th>Type</th> <th>Name</th> <th>Description</th></tr> - * <tr><td>String</td> <td>opennlp.uima.ModelName</td> <td>The name of the model file</td></tr> - * <tr><td>String</td> <td>opennlp.uima.SentenceType</td> <td>The full name of the sentence type</td></tr> - * <tr><td>String</td> <td>opennlp.uima.TokenType</td> <td>The full name of the token type</td></tr> - * </table> - * <p> - * Optional parameters - * <table border=1> - * <caption></caption> - * <tr><th>Type</th> <th>Name</th> <th>Description</th></tr> - * <tr><td>Boolean</td> <td>opennlp.uima.tokenizer.IsSkipAlphaNumerics</td></tr> - * </table> - * - * @deprecated will be removed after 1.7.1 release, there is no replacement - */ -@Deprecated - -public final class TokenizerTrainer extends CasConsumer_ImplBase { - - private static final String IS_ALPHA_NUMERIC_OPTIMIZATION = - "opennlp.uima.tokenizer.IsAlphaNumericOptimization"; - - private List<TokenSample> tokenSamples = new ArrayList<>(); - - private UimaContext mContext; - - private Type mSentenceType; - - private Type mTokenType; - - private String mModelName; - - private String additionalTrainingDataFile; - - private String additionalTrainingDataEncoding; - - private String language; - - private Boolean isSkipAlphaNumerics; - - private Logger mLogger; - - private String sampleTraceFileEncoding; - - private File sampleTraceFile; - - /** - * Initializes the current instance. - */ - public void initialize() throws ResourceInitializationException { - - super.initialize(); - - mContext = getUimaContext(); - - mLogger = mContext.getLogger(); - - if (mLogger.isLoggable(Level.INFO)) { - mLogger.log(Level.INFO, "Initializing the OpenNLP Tokenizer trainer."); - } - - mModelName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.MODEL_PARAMETER); - - language = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.LANGUAGE_PARAMETER); - - isSkipAlphaNumerics = - CasConsumerUtil.getOptionalBooleanParameter( - mContext, IS_ALPHA_NUMERIC_OPTIMIZATION); - - if (isSkipAlphaNumerics == null) { - isSkipAlphaNumerics = false; - } - - additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter( - getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_FILE); - - // If the additional training data is specified, the encoding must be provided! - if (additionalTrainingDataFile != null) { - additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter( - getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_ENCODING); - } - - String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter( - getUimaContext(), "opennlp.uima.SampleTraceFile"); - - if (sampleTraceFileName != null) { - sampleTraceFile = new File(getUimaContextAdmin().getResourceManager() - .getDataPath() + File.separatorChar + sampleTraceFileName); - sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter( - getUimaContext(), "opennlp.uima.SampleTraceFileEncoding"); - } - } - - /** - * Initialize the current instance with the given type system. - */ - public void typeSystemInit(TypeSystem typeSystem) - throws ResourceInitializationException { - - String sentenceTypeName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.SENTENCE_TYPE_PARAMETER); - - mSentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName); - - String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(mContext, - UimaUtil.TOKEN_TYPE_PARAMETER); - - mTokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName); - } - - /** - * Process the given CAS object. - */ - public void processCas(CAS cas) { - - FSIndex<AnnotationFS> sentenceAnnotations = cas.getAnnotationIndex(mSentenceType); - - for (AnnotationFS sentence : sentenceAnnotations) { - process(cas, sentence); - } - } - - private void process(CAS tcas, AnnotationFS sentence) { - FSIndex<AnnotationFS> allTokens = tcas.getAnnotationIndex(mTokenType); - - ContainingConstraint containingConstraint = - new ContainingConstraint(sentence); - - Iterator<AnnotationFS> containingTokens = tcas.createFilteredIterator( - allTokens.iterator(), containingConstraint); - - List<Span> openNLPSpans = new LinkedList<>(); - - while (containingTokens.hasNext()) { - AnnotationFS tokenAnnotation = containingTokens.next(); - - openNLPSpans.add(new Span(tokenAnnotation.getBegin() - - sentence.getBegin(), tokenAnnotation.getEnd() - - sentence.getBegin())); - } - - Span[] spans = openNLPSpans.toArray(new Span[openNLPSpans.size()]); - - Arrays.sort(spans); - - tokenSamples.add(new TokenSample(sentence.getCoveredText(), spans)); - } - - /** - * Called if the processing is finished, this method - * does the training. - */ - public void collectionProcessComplete(ProcessTrace arg0) - throws ResourceProcessException, IOException { - - if (mLogger.isLoggable(Level.INFO)) { - mLogger.log(Level.INFO, "Collected " + tokenSamples.size() + - " token samples."); - } - - GIS.PRINT_MESSAGES = false; - - ObjectStream<TokenSample> samples = ObjectStreamUtils.createObjectStream(tokenSamples); - - // Write stream to disk ... - // if trace file - // serialize events ... - - Writer samplesOut; - TokenizerModel tokenModel; - - if (additionalTrainingDataFile != null) { - - if (mLogger.isLoggable(Level.INFO)) { - mLogger.log(Level.INFO, "Using addional training data file: " + additionalTrainingDataFile); - } - - InputStreamFactory additionalTrainingDataIn = new MarkableFileInputStreamFactory( - new File(additionalTrainingDataFile)); - - Charset additionalTrainingDataCharset = Charset - .forName(additionalTrainingDataEncoding); - - ObjectStream<TokenSample> additionalSamples = new TokenSampleStream( - new PlainTextByLineStream(additionalTrainingDataIn, - additionalTrainingDataCharset)); - - samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples); - } - - if (sampleTraceFile != null) { - samplesOut = new OutputStreamWriter(new FileOutputStream(sampleTraceFile), sampleTraceFileEncoding); - samples = new SampleTraceStream<>(samples, samplesOut); - } - - tokenModel = TokenizerME.train(samples, - TokenizerFactory.create(null, language, null, isSkipAlphaNumerics, null), - ModelUtil.createDefaultTrainingParameters()); - - // dereference to allow garbage collection - tokenSamples = null; - - File modelFile = new File(getUimaContextAdmin().getResourceManager() - .getDataPath() + File.separatorChar + mModelName); - - OpennlpUtil.serialize(tokenModel, modelFile); - } - - /** - * The trainer is not stateless. - */ - public boolean isStateless() { - return false; - } - - /** - * Releases allocated resources. - */ - public void destroy() { - // dereference to allow garbage collection - tokenSamples = null; - } -}