Author: joern
Date: Wed May 25 13:44:45 2016
New Revision: 1745485
URL: http://svn.apache.org/viewvc?rev=1745485&view=rev
Log:
OPENNLP-850 Add ner brat annotation service
Added:
opennlp/sandbox/opennlp-brat-annotator/
opennlp/sandbox/opennlp-brat-annotator/pom.xml (with props)
opennlp/sandbox/opennlp-brat-annotator/src/
opennlp/sandbox/opennlp-brat-annotator/src/main/
opennlp/sandbox/opennlp-brat-annotator/src/main/java/
opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/
opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/
opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java
opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java
opennlp/sandbox/opennlp-brat-annotator/src/test/
opennlp/sandbox/opennlp-brat-annotator/src/test/java/
opennlp/sandbox/opennlp-brat-annotator/src/test/java/opennlp/
opennlp/sandbox/opennlp-brat-annotator/src/test/java/opennlp/bratannotator/
Modified:
opennlp/sandbox/mallet-addon/params/crf-params.txt
opennlp/sandbox/mallet-addon/params/maxent-params.txt
opennlp/sandbox/mallet-addon/pom.xml
opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
Modified: opennlp/sandbox/mallet-addon/params/crf-params.txt
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/mallet-addon/params/crf-params.txt?rev=1745485&r1=1745484&r2=1745485&view=diff
==============================================================================
--- opennlp/sandbox/mallet-addon/params/crf-params.txt (original)
+++ opennlp/sandbox/mallet-addon/params/crf-params.txt Wed May 25 13:44:45 2016
@@ -15,6 +15,6 @@
# Sample machine learning properties file
Algorithm=opennlp.addons.mallet.CRFTrainer
-Cutoff=0
+Cutoff=2
Iterations=100
Modified: opennlp/sandbox/mallet-addon/params/maxent-params.txt
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/mallet-addon/params/maxent-params.txt?rev=1745485&r1=1745484&r2=1745485&view=diff
==============================================================================
--- opennlp/sandbox/mallet-addon/params/maxent-params.txt (original)
+++ opennlp/sandbox/mallet-addon/params/maxent-params.txt Wed May 25 13:44:45 2016
@@ -14,7 +14,8 @@
# limitations under the License.
# Sample machine learning properties file
-Algorithm=opennlp.addons.mallet.MaxentTrainer
+#Algorithm=opennlp.addons.mallet.MaxentTrainer
+Algorithm=PERCEPTRON
Cutoff=0
Iterations=100
-
+#BeamSize=5
Modified: opennlp/sandbox/mallet-addon/pom.xml
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/mallet-addon/pom.xml?rev=1745485&r1=1745484&r2=1745485&view=diff
==============================================================================
Binary files - no diff available.
Modified: opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java?rev=1745485&r1=1745484&r2=1745485&view=diff
==============================================================================
--- opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java (original)
+++ opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java Wed May 25 13:44:45 2016
@@ -127,21 +127,20 @@ public class CRFTrainer extends Abstract
// CRFOptimizableBy* objects (terms in the objective function)
// objective 1: label likelihood objective
- CRFTrainerByLabelLikelihood crfTrainer = new CRFTrainerByLabelLikelihood(
- crf);
- crfTrainer.setGaussianPriorVariance(1.0);
-
-// CRFOptimizableByLabelLikelihood optLabel = new
-// CRFOptimizableByLabelLikelihood(crf, trainingData);
-//
+// CRFTrainerByLabelLikelihood crfTrainer = new CRFTrainerByLabelLikelihood(crf);
+// crfTrainer.setGaussianPriorVariance(1.0);
+
+ CRFOptimizableByLabelLikelihood optLabel = new
+ CRFOptimizableByLabelLikelihood(crf, trainingData);
+
// // CRF trainer
-// Optimizable.ByGradientValue[] opts = new Optimizable.ByGradientValue[] {
-// optLabel };
+ Optimizable.ByGradientValue[] opts = new Optimizable.ByGradientValue[] {
+ optLabel };
- // by default, use L-BFGS as the optimizer
-// CRFTrainerByValueGradients crfTrainer = new CRFTrainerByValueGradients(
-// crf, opts);
-// crfTrainer.setMaxResets(0);
+// by default, use L-BFGS as the optimizer
+ CRFTrainerByValueGradients crfTrainer = new CRFTrainerByValueGradients(
+ crf, opts);
+ crfTrainer.setMaxResets(0);
// SNIP
Modified: opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java?rev=1745485&r1=1745484&r2=1745485&view=diff
==============================================================================
--- opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java (original)
+++ opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java Wed May 25 13:44:45 2016
@@ -27,8 +27,17 @@ import java.util.Map;
import opennlp.tools.ml.AbstractEventTrainer;
import opennlp.tools.ml.model.DataIndexer;
import opennlp.tools.ml.model.MaxentModel;
+import cc.mallet.classify.C45Trainer;
import cc.mallet.classify.Classifier;
+import cc.mallet.classify.MaxEntGETrainer;
+import cc.mallet.classify.MaxEntL1Trainer;
+import cc.mallet.classify.MaxEntPRTrainer;
import cc.mallet.classify.MaxEntTrainer;
+import cc.mallet.classify.NaiveBayes;
+import cc.mallet.classify.NaiveBayesEMTrainer;
+import cc.mallet.classify.NaiveBayesTrainer;
+import cc.mallet.optimize.LimitedMemoryBFGS;
+import cc.mallet.optimize.Optimizer;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
@@ -67,22 +76,21 @@ public class MaxentTrainer extends Abstr
weights[featureIndex] = indexer.getNumTimesEventsSeen()[contextIndex];
}
- FeatureVector fv = new FeatureVector(dataAlphabet, malletFeatures,
- weights);
+ FeatureVector fv = new FeatureVector(dataAlphabet, malletFeatures, weights);
Instance inst = new Instance(fv, targetAlphabet.lookupLabel(
- indexer.getOutcomeLabels()[outcomes[contextIndex]], true), "name",
+ indexer.getOutcomeLabels()[outcomes[contextIndex]], true), "fid:" + contextIndex,
"data-indexer");
instances.add(inst);
}
InstanceList trainingData = new InstanceList(dataAlphabet, targetAlphabet);
- Instance inst = instances.iterator().next();
-
- Alphabet.alphabetsMatch(trainingData, inst);
+
trainingData.addAll(instances);
MaxEntTrainer trainer = new MaxEntTrainer();
-
+// trainer.setGaussianPriorVariance(1d);
+// trainer.setNumIterations(100);
+
Classifier classifier = trainer.train(trainingData);
return new ClassifierModel(classifier);
Added: opennlp/sandbox/opennlp-brat-annotator/pom.xml
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-brat-annotator/pom.xml?rev=1745485&view=auto
==============================================================================
Binary file - no diff available.
Propchange: opennlp/sandbox/opennlp-brat-annotator/pom.xml
------------------------------------------------------------------------------
svn:mime-type = application/xml
Added: opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java?rev=1745485&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java (added)
+++ opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java Wed May 25 13:44:45 2016
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.bratannotator;
+
+import java.io.File;
+import java.net.URI;
+import java.net.URL;
+
+import org.eclipse.jetty.server.Server;
+import org.eclipse.jetty.servlet.ServletContextHandler;
+import org.eclipse.jetty.servlet.ServletHolder;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.sentdetect.NewlineSentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+
+public class BratAnnService {
+
+ public static SentenceDetector sentenceDetector;
+ public static Tokenizer tokenizer;
+ public static TokenNameFinder nameFinders[];
+
+ public static void main(String[] args) throws Exception {
+
+ if (args.length < 3) {
+ System.out.println("sentenceDetectorURI tokenizerURI namefinderURI_1 ... nameFinderURI_n");
+ return;
+ }
+
+ URI sentenceDetectorUri = URI.create(args[0]);
+ if ("sentenceDetector".equals(sentenceDetectorUri.getScheme())) {
+
+ if ("newline".equals(sentenceDetectorUri.getSchemeSpecificPart())) {
+ sentenceDetector = new NewlineSentenceDetector();
+ }
+ else {
+ System.out.println("unkown sentence detector");
+ return;
+ }
+ }
+ else {
+ sentenceDetector = new SentenceDetectorME(new SentenceModel(new File(args[0])));
+ }
+
+ URI tokenizerUri = URI.create(args[1]);
+ if ("tokenizer".equals(tokenizerUri.getScheme())) {
+ if ("whitespace".equals(tokenizerUri.getSchemeSpecificPart())) {
+ tokenizer = WhitespaceTokenizer.INSTANCE;
+ }
+ else if ("simple".equals(tokenizerUri.getSchemeSpecificPart())) {
+ tokenizer = SimpleTokenizer.INSTANCE;
+ }
+ else {
+ System.out.println("unkown sentence detector");
+ return;
+ }
+
+ }
+ else {
+ tokenizer = new TokenizerME(new TokenizerModel(new File(args[1])));
+ }
+
+ nameFinders = new TokenNameFinder[] {new NameFinderME(new TokenNameFinderModel(new URL(args[2])))};
+
+ ServletContextHandler context = new ServletContextHandler(
+ ServletContextHandler.SESSIONS);
+ context.setContextPath("/");
+
+ Server jettyServer = new Server(8080);
+ jettyServer.setHandler(context);
+
+ ServletHolder jerseyServlet = context
+ .addServlet(com.sun.jersey.spi.container.servlet.ServletContainer.class, "/*");
+ jerseyServlet.setInitParameter("com.sun.jersey.config.property.packages", "opennlp.bratannotator");
+ jerseyServlet.setInitParameter("com.sun.jersey.api.json.POJOMappingFeature", "true");
+ jerseyServlet.setInitOrder(0);
+
+ jerseyServlet.setInitParameter("jersey.config.server.provider.classnames",
+ BratNameFinderResource.class.getCanonicalName());
+
+ try {
+ jettyServer.start();
+ jettyServer.join();
+ } finally {
+ jettyServer.destroy();
+ }
+ }
+}
Added: opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java?rev=1745485&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java (added)
+++ opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java Wed May 25 13:44:45 2016
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.bratannotator;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.QueryParam;
+import javax.ws.rs.core.MediaType;
+
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+
+@Path("/ner")
+public class BratNameFinderResource {
+
+ public static class NameAnn {
+ public int[][] offsets;
+ public String[] texts;
+ public String type;
+ }
+
+ private SentenceDetector sentDetect = BratAnnService.sentenceDetector;
+ private Tokenizer tokenizer = BratAnnService.tokenizer;
+ private TokenNameFinder nameFinders[] = BratAnnService.nameFinders;
+
+ private static int findNextNonWhitespaceChar(CharSequence s, int beginOffset,
+ int endOffset) {
+
+ for (int i = beginOffset; i < endOffset; i++) {
+ if (!Character.isSpaceChar(s.charAt(i))) {
+ return i;
+ }
+ }
+
+ return -1;
+ }
+
+ @POST
+ @Consumes(MediaType.TEXT_PLAIN)
+ @Produces(MediaType.APPLICATION_JSON)
+ public Map<String, NameAnn> findNames(@QueryParam("model") String modelName,
+ String text) {
+
+ Span sentenceSpans[] = sentDetect.sentPosDetect(text);
+
+ Map<String, NameAnn> map = new HashMap<String, NameAnn>();
+
+ int indexCounter = 0;
+
+ for (int i = 0; i < sentenceSpans.length; i++) {
+
+ String sentenceText = sentenceSpans[i].getCoveredText(text).toString();
+
+ // offset of sentence gets lost here!
+ Span tokenSpans[] = tokenizer
+ .tokenizePos(sentenceText);
+
+ String tokens[] = Span.spansToStrings(tokenSpans, sentenceText);
+
+ for (TokenNameFinder nameFinder : nameFinders) {
+ Span names[] = nameFinder.find(tokens);
+
+ for (Span name : names) {
+
+ int beginOffset = tokenSpans[name.getStart()].getStart()
+ + sentenceSpans[i].getStart();
+ int endOffset = tokenSpans[name.getEnd() - 1].getEnd()
+ + sentenceSpans[i].getStart();
+
+ // create a list of new line indexes
+ List<Integer> newLineIndexes = new ArrayList<Integer>();
+
+ // TODO: Code needs to handle case that there are multiple new lines
+ // in a row
+
+ boolean inNewLineSequence = false;
+ for (int ci = beginOffset; ci < endOffset; ci++) {
+ if (text.charAt(ci) == '\n' || text.charAt(ci) == '\r') {
+ if (!inNewLineSequence) {
+ newLineIndexes.add(ci);
+ }
+ inNewLineSequence = true;
+ } else {
+ inNewLineSequence = false;
+ }
+ }
+
+ List<String> textSegments = new ArrayList<String>();
+ List<int[]> spanSegments = new ArrayList<int[]>();
+
+ int segmentBegin = beginOffset;
+
+ for (int newLineOffset : newLineIndexes) {
+ // create segment from begin to offset
+ textSegments.add(text.substring(segmentBegin, newLineOffset));
+ spanSegments.add(new int[] { segmentBegin, newLineOffset });
+
+ segmentBegin = findNextNonWhitespaceChar(text, newLineOffset + 1,
+ endOffset);
+
+ if (segmentBegin == -1) {
+ break;
+ }
+ }
+
+ // create left over segment
+ if (segmentBegin != -1) {
+ textSegments.add(text.substring(segmentBegin, endOffset));
+ spanSegments.add(new int[] { segmentBegin, endOffset });
+ }
+
+ NameAnn ann = new NameAnn();
+ ann.texts = textSegments.toArray(new String[textSegments.size()]);
+ ann.offsets = spanSegments.toArray(new int[spanSegments.size()][]);
+ ann.type = name.getType();
+
+ map.put(Integer.toString(indexCounter++), ann);
+ }
+ }
+ }
+
+ return map;
+ }
+}