This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git


The following commit(s) were added to refs/heads/master by this push:
     new 0c0af3f  OPENNLP-1634 - Move OpenNLP Brat Annotator back to Sandbox
0c0af3f is described below

commit 0c0af3f6a5ca29c2c127fbead42c1cdd7a801967
Author: Richard Zowalla <[email protected]>
AuthorDate: Tue Oct 29 13:51:45 2024 +0100

    OPENNLP-1634 - Move OpenNLP Brat Annotator back to Sandbox
---
 opennlp-brat-annotator/pom.xml                     | 119 ++++++++++++++++++
 .../src/main/bin/brat-annotation-service           |  56 +++++++++
 .../src/main/bin/brat-annotation-service.bat       |  51 ++++++++
 .../java/opennlp/bratann/NameFinderAnnService.java | 102 +++++++++++++++
 .../java/opennlp/bratann/NameFinderResource.java   | 138 +++++++++++++++++++++
 pom.xml                                            |   1 +
 6 files changed, 467 insertions(+)

diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
new file mode 100644
index 0000000..75beb96
--- /dev/null
+++ b/opennlp-brat-annotator/pom.xml
@@ -0,0 +1,119 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+       license agreements. See the NOTICE file distributed with this work for additional
+       information regarding copyright ownership. The ASF licenses this file to
+       you under the Apache License, Version 2.0 (the "License"); you may not use
+       this file except in compliance with the License. You may obtain a copy of
+       the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+       by applicable law or agreed to in writing, software distributed under the
+       License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+       OF ANY KIND, either express or implied. See the License for the specific
+       language governing permissions and limitations under the License. -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+       <modelVersion>4.0.0</modelVersion>
+
+       <parent>
+               <groupId>org.apache.opennlp</groupId>
+               <artifactId>opennlp-sandbox</artifactId>
+               <version>2.4.1-SNAPSHOT</version>
+       </parent>
+
+       <artifactId>opennlp-brat-annotator</artifactId>
+       <packaging>jar</packaging>
+
+       <name>Apache OpenNLP Brat Annotator</name>
+
+       <properties>
+               <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+               <jackson.version>2.18.0</jackson.version>
+               <jersey.version>3.1.9</jersey.version>
+       </properties>
+
+       <dependencies>
+               <dependency>
+                       <groupId>org.slf4j</groupId>
+                       <artifactId>slf4j-api</artifactId>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.glassfish.jersey.containers</groupId>
+                       <artifactId>jersey-container-grizzly2-http</artifactId>
+                       <version>${jersey.version}</version>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.glassfish.jersey.media</groupId>
+                       <artifactId>jersey-media-json-jackson</artifactId>
+                       <version>${jersey.version}</version>
+                       <scope>runtime</scope>
+               </dependency>
+
+               <dependency>
+                       <groupId>com.fasterxml.jackson.core</groupId>
+                       <artifactId>jackson-annotations</artifactId>
+                       <version>${jackson.version}</version>
+                       <scope>runtime</scope>
+               </dependency>
+
+               <dependency>
+                       <groupId>com.fasterxml.jackson.core</groupId>
+                       <artifactId>jackson-databind</artifactId>
+                       <version>${jackson.version}</version>
+                       <scope>runtime</scope>
+               </dependency>
+
+               <dependency>
+                       <groupId>com.fasterxml.jackson.module</groupId>
+                       <artifactId>jackson-module-jaxb-annotations</artifactId>
+                       <version>${jackson.version}</version>
+                       <scope>runtime</scope>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.apache.opennlp</groupId>
+                       <artifactId>opennlp-tools</artifactId>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.junit.jupiter</groupId>
+                       <artifactId>junit-jupiter-api</artifactId>
+                       <scope>test</scope>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.junit.jupiter</groupId>
+                       <artifactId>junit-jupiter-engine</artifactId>
+                       <scope>test</scope>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.slf4j</groupId>
+                       <artifactId>slf4j-simple</artifactId>
+                       <version>${slf4j.version}</version>
+                       <scope>test</scope>
+               </dependency>
+       </dependencies>
+       <build>
+               <plugins>
+                       <plugin>
+                               <artifactId>maven-assembly-plugin</artifactId>
+                               <configuration>
+                                       <descriptorRefs>
+                                               <descriptorRef>jar-with-dependencies</descriptorRef>
+                                       </descriptorRefs>
+                               </configuration>
+                               <executions>
+                                       <execution>
+                                               <id>make-assembly</id>
+                                               <phase>package</phase>
+                                               <goals>
+                                                       <goal>single</goal>
+                                               </goals>
+                                       </execution>
+                               </executions>
+                       </plugin>
+               </plugins>
+       </build>
+</project>
diff --git a/opennlp-brat-annotator/src/main/bin/brat-annotation-service b/opennlp-brat-annotator/src/main/bin/brat-annotation-service
new file mode 100755
index 0000000..eac9566
--- /dev/null
+++ b/opennlp-brat-annotator/src/main/bin/brat-annotation-service
@@ -0,0 +1,56 @@
+#!/bin/sh
+
+#   Licensed to the Apache Software Foundation (ASF) under one
+#   or more contributor license agreements.  See the NOTICE file
+#   distributed with this work for additional information
+#   regarding copyright ownership.  The ASF licenses this file
+#   to you under the Apache License, Version 2.0 (the
+#   "License"); you may not use this file except in compliance
+#   with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing,
+#   software distributed under the License is distributed on an
+#   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#   KIND, either express or implied.  See the License for the
+#   specific language governing permissions and limitations
+#   under the License.
+
+# Note:  Do not output anything in this script file, any output
+#        may be inadvertently placed in any output files if
+#        output redirection is used.
+
+# Determine OPENNLP_HOME - $0 may be a symlink to OpenNLP's home.
+PRG="$0"
+
+# Follow symlinks until PRG names the real script file. A relative link
+# target is resolved against the directory that contains the link.
+while [ -h "$PRG" ] ; do
+  ls=`ls -ld "$PRG"`
+  link=`expr "$ls" : '.*-> \(.*\)$'`
+  if expr "$link" : '/.*' > /dev/null; then
+    PRG="$link"
+  else
+    PRG="`dirname "$PRG"`/$link"
+  fi
+done
+
+saveddir=`pwd`
+
+# The script lives in <home>/bin, so the home directory is one level up.
+OPENNLP_HOME=`dirname "$PRG"`/..
+
+# make it fully qualified
+OPENNLP_HOME=`cd "$OPENNLP_HOME" && pwd`
+
+cd "$saveddir"
+
+# Pick the Java launcher: a preset JAVACMD wins, then JAVA_HOME, then the PATH.
+if [ -z "$JAVACMD" ] ; then
+  if [ -n "$JAVA_HOME"  ] ; then
+    JAVACMD="$JAVA_HOME/bin/java"
+  else
+    JAVACMD="`which java`"
+  fi
+fi
+
+# Join all jars in lib with ':'. The quoted glob keeps directory names that
+# contain blanks intact; piping the list through "tr ' ' ':'" would split them.
+CLASSPATH=
+for jar in "$OPENNLP_HOME"/lib/*.jar; do
+  CLASSPATH="${CLASSPATH:+$CLASSPATH:}$jar"
+done
+
+# "$@" forwards every argument unchanged, including arguments with blanks.
+"$JAVACMD" -Xmx1024m -Dlog4j.configurationFile="$OPENNLP_HOME/conf/log4j2.xml" \
+  -cp "$CLASSPATH" opennlp.bratann.NameFinderAnnService "$@"
diff --git a/opennlp-brat-annotator/src/main/bin/brat-annotation-service.bat b/opennlp-brat-annotator/src/main/bin/brat-annotation-service.bat
new file mode 100755
index 0000000..289248b
--- /dev/null
+++ b/opennlp-brat-annotator/src/main/bin/brat-annotation-service.bat
@@ -0,0 +1,51 @@
+@ECHO off
+
+REM #   Licensed to the Apache Software Foundation (ASF) under one
+REM #   or more contributor license agreements.  See the NOTICE file
+REM #   distributed with this work for additional information
+REM #   regarding copyright ownership.  The ASF licenses this file
+REM #   to you under the Apache License, Version 2.0 (the
+REM #   "License"); you may not use this file except in compliance
+REM #   with the License.  You may obtain a copy of the License at
+REM #
+REM #    http://www.apache.org/licenses/LICENSE-2.0
+REM #
+REM #   Unless required by applicable law or agreed to in writing,
+REM #   software distributed under the License is distributed on an
+REM #   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+REM #   KIND, either express or implied.  See the License for the
+REM #   specific language governing permissions and limitations
+REM #   under the License.
+
+REM # Note:  Do not output anything in this script file, any output
+REM #        may be inadvertently placed in any output files if
+REM #        output redirection is used.
+SETLOCAL
+
+REM # Select the Java launcher. A preset JAVA_CMD is used as is, otherwise
+REM # JAVA_HOME is consulted, and as a last resort java is taken from the PATH.
+IF "%JAVA_CMD%" == "" (
+       IF "%JAVA_HOME%" == "" (
+               SET JAVA_CMD=java
+       ) ELSE (
+               REM # Keep JAVA_HOME to short-name without spaces
+               FOR %%A IN ("%JAVA_HOME%") DO SET JAVA_CMD=%%~sfA\bin\java
+       )
+)
+
+REM #  Should work with Windows XP and greater.  If not, specify the path to where it is installed.
+REM #  The default is the parent of the directory holding this script. The drive
+REM #  letter is kept so the script also works when started from another drive.
+IF "%OPENNLP_HOME%" == "" (
+       SET OPENNLP_HOME=%~dps0..
+) ELSE (
+       REM # Keep OPENNLP_HOME to short-name without spaces
+       FOR %%A IN ("%OPENNLP_HOME%") DO SET OPENNLP_HOME=%%~sfA
+)
+
+REM # Build one quoted, semicolon separated class path from every jar in lib.
+REM # Delayed expansion is required because CLASSPATH changes inside the loop.
+setLocal EnableDelayedExpansion
+set CLASSPATH="
+
+FOR %%A IN ("%OPENNLP_HOME%\lib\*.jar") DO (
+       set CLASSPATH=!CLASSPATH!;%%A
+)
+set CLASSPATH=!CLASSPATH!"
+
+REM # The launch command must stay on a single line.
+%JAVA_CMD% -Xmx1024m "-Dlog4j.configurationFile=%OPENNLP_HOME%\conf\log4j2.xml" -cp %CLASSPATH% opennlp.bratann.NameFinderAnnService %*
+
+ENDLOCAL
\ No newline at end of file
diff --git a/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderAnnService.java b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderAnnService.java
new file mode 100644
index 0000000..1735cb8
--- /dev/null
+++ b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderAnnService.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.bratann;
+
+import java.io.File;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.List;
+
+import jakarta.ws.rs.core.UriBuilder;
+import org.glassfish.jersey.grizzly2.httpserver.GrizzlyHttpServerFactory;
+import org.glassfish.jersey.server.ResourceConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.sentdetect.NewlineSentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+
+public class NameFinderAnnService {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(NameFinderAnnService.class);
+  static SentenceDetector sentenceDetector = new NewlineSentenceDetector();
+  static Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+  static TokenNameFinder[] nameFinders;
+
+  public static void main(String[] args) throws Exception {
+
+    if (args.length == 0) {
+      LOG.info("Usage:");
+      LOG.info("[NameFinderAnnService -serverPort port] [-tokenizerModel file] 
"
+          + "[-ruleBasedTokenizer whitespace|simple] "
+          + "[-sentenceDetectorModel file] namefinderFile|nameFinderURI");
+      return;
+    }
+
+    List<String> argList = Arrays.asList(args);
+
+    int serverPort = 8080;
+    int serverPortIndex = argList.indexOf("-serverPort") + 1;
+
+    if (serverPortIndex > 0 && serverPortIndex < args.length) {
+      serverPort = Integer.parseInt(args[serverPortIndex]);
+    }
+
+    int sentenceModelIndex = argList.indexOf("-sentenceDetectorModel") + 1;
+    if (sentenceModelIndex > 0 && sentenceModelIndex < args.length) {
+      sentenceDetector = new SentenceDetectorME(
+          new SentenceModel(new File(args[sentenceModelIndex])));
+    }
+
+    int ruleBasedTokenizerIndex = argList.indexOf("-ruleBasedTokenizer") + 1;
+
+    if (ruleBasedTokenizerIndex > 0 && ruleBasedTokenizerIndex < args.length) {
+      if ("whitespace".equals(args[ruleBasedTokenizerIndex])) {
+        tokenizer = WhitespaceTokenizer.INSTANCE;
+      } else if ("simple".equals(args[ruleBasedTokenizerIndex])) {
+        tokenizer = SimpleTokenizer.INSTANCE;
+      } else {
+        LOG.error("unknown tokenizer: {}", args[ruleBasedTokenizerIndex]);
+        return;
+      }
+    }
+
+    int tokenizerModelIndex = argList.indexOf("-tokenizerModel") + 1;
+    if (tokenizerModelIndex > 0 && tokenizerModelIndex < args.length) {
+      tokenizer = new TokenizerME(
+          new TokenizerModel(new File(args[tokenizerModelIndex])));
+    }
+
+    nameFinders = new TokenNameFinder[] {new NameFinderME(
+        new TokenNameFinderModel(new File(args[args.length - 1])))};
+
+    URI baseUri = 
UriBuilder.fromUri("http://localhost/";).port(serverPort).build();
+    ResourceConfig config = new ResourceConfig(NameFinderResource.class);
+    GrizzlyHttpServerFactory.createHttpServer(baseUri, config);
+  }
+}
diff --git a/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderResource.java b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderResource.java
new file mode 100644
index 0000000..f824c18
--- /dev/null
+++ b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderResource.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.bratann;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import jakarta.ws.rs.Consumes;
+import jakarta.ws.rs.POST;
+import jakarta.ws.rs.Path;
+import jakarta.ws.rs.Produces;
+import jakarta.ws.rs.QueryParam;
+import jakarta.ws.rs.core.MediaType;
+
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+
+/**
+ * REST resource under {@code /ner} that runs the configured name finders over
+ * plain text and reports every detected name together with its character
+ * offsets in the submitted text.
+ * <p>
+ * The sentence detector, tokenizer and name finders are read from the static
+ * fields of {@link NameFinderAnnService}.
+ * NOTE(review): those instances are shared through static fields; confirm that
+ * they are safe to use from concurrent requests.
+ */
+@Path("/ner")
+public class NameFinderResource {
+
+  private final SentenceDetector sentDetect = NameFinderAnnService.sentenceDetector;
+  private final Tokenizer tokenizer = NameFinderAnnService.tokenizer;
+  private final TokenNameFinder[] nameFinders = NameFinderAnnService.nameFinders;
+
+  /**
+   * Returns the index of the first character in {@code [beginOffset, endOffset)}
+   * that is not whitespace.
+   * <p>
+   * {@link Character#isSpaceChar(char)} alone does not cover line breaks such
+   * as {@code \n} and {@code \r}, so {@link Character#isWhitespace(char)} is
+   * checked as well. Otherwise a segment following {@code \r\n} or a run of
+   * newlines would start with a line break.
+   *
+   * @param s           the text to scan
+   * @param beginOffset the first index to inspect (inclusive)
+   * @param endOffset   the index at which the scan stops (exclusive)
+   * @return the index of the first non-whitespace character, or {@code -1} if
+   *         the range holds only whitespace
+   */
+  private static int findNextNonWhitespaceChar(CharSequence s, int beginOffset, int endOffset) {
+    for (int i = beginOffset; i < endOffset; i++) {
+      char c = s.charAt(i);
+      if (!Character.isWhitespace(c) && !Character.isSpaceChar(c)) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * Builds the annotation for one name. A name that spans line breaks is cut
+   * into one segment per line, so that no segment contains a line break and no
+   * segment is empty.
+   *
+   * @param text        the complete submitted text
+   * @param beginOffset the start of the name in {@code text} (inclusive)
+   * @param endOffset   the end of the name in {@code text} (exclusive)
+   * @param type        the name type reported by the name finder
+   * @return the annotation with parallel {@code texts} and {@code offsets} arrays
+   */
+  private static NameAnn createAnnotation(String text, int beginOffset, int endOffset,
+      String type) {
+    List<String> textSegments = new ArrayList<>();
+    List<int[]> spanSegments = new ArrayList<>();
+
+    int segmentBegin = beginOffset;
+
+    for (int ci = beginOffset; ci < endOffset; ci++) {
+      char c = text.charAt(ci);
+      if (c != '\n' && c != '\r') {
+        continue;
+      }
+
+      // Close the segment that ends at this line break.
+      if (segmentBegin < ci) {
+        textSegments.add(text.substring(segmentBegin, ci));
+        spanSegments.add(new int[] {segmentBegin, ci});
+      }
+
+      // Skip the rest of the line break together with any leading whitespace
+      // of the following line.
+      segmentBegin = findNextNonWhitespaceChar(text, ci + 1, endOffset);
+      if (segmentBegin == -1) {
+        break;
+      }
+
+      // Resume the scan at the start of the new segment.
+      ci = segmentBegin - 1;
+    }
+
+    // create left over segment
+    if (segmentBegin != -1) {
+      textSegments.add(text.substring(segmentBegin, endOffset));
+      spanSegments.add(new int[] {segmentBegin, endOffset});
+    }
+
+    NameAnn ann = new NameAnn();
+    ann.texts = textSegments.toArray(new String[0]);
+    ann.offsets = spanSegments.toArray(new int[0][]);
+    ann.type = type;
+    return ann;
+  }
+
+  /**
+   * Detects names in the posted plain text.
+   *
+   * @param modelName the value of the {@code model} query parameter; it is
+   *                  currently not evaluated
+   * @param text      the request body to analyze
+   * @return the detected names, keyed by a running index starting at
+   *         {@code "0"}
+   */
+  @POST
+  @Consumes(MediaType.TEXT_PLAIN)
+  @Produces(MediaType.APPLICATION_JSON)
+  public Map<String, NameAnn> findNames(@QueryParam("model") String modelName, String text) {
+    Map<String, NameAnn> map = new HashMap<>();
+    int indexCounter = 0;
+
+    for (Span sentenceSpan : sentDetect.sentPosDetect(text)) {
+
+      String sentenceText = sentenceSpan.getCoveredText(text).toString();
+
+      // Token spans are relative to the sentence. The sentence start is added
+      // back below to obtain offsets into the complete text.
+      Span[] tokenSpans = tokenizer.tokenizePos(sentenceText);
+      String[] tokens = Span.spansToStrings(tokenSpans, sentenceText);
+
+      for (TokenNameFinder nameFinder : nameFinders) {
+        for (Span name : nameFinder.find(tokens)) {
+
+          // Name spans count tokens; the end index is exclusive.
+          int beginOffset = tokenSpans[name.getStart()].getStart() + sentenceSpan.getStart();
+          int endOffset = tokenSpans[name.getEnd() - 1].getEnd() + sentenceSpan.getStart();
+
+          map.put(Integer.toString(indexCounter++),
+              createAnnotation(text, beginOffset, endOffset, name.getType()));
+        }
+      }
+    }
+    return map;
+  }
+
+  /**
+   * One detected name as it is serialized into the response.
+   */
+  public static class NameAnn {
+    /** One {@code {begin, end}} pair per segment, as offsets into the submitted text. */
+    public int[][] offsets;
+    /** The covered text of each segment, parallel to {@link #offsets}. */
+    public String[] texts;
+    /** The name type reported by the name finder. */
+    public String type;
+  }
+}
diff --git a/pom.xml b/pom.xml
index 6717b3a..18279d0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -99,6 +99,7 @@
         <module>mallet-addon</module>
         <module>modelbuilder-addon</module>
         <module>nlp-utils</module>
+        <module>opennlp-brat-annotator</module>
         <module>opennlp-coref</module>
         <module>opennlp-dl</module>
         <module>opennlp-similarity</module>

Reply via email to