(opennlp) 01/01: OPENNLP-1633 - Remove dependency towards jackson-databind in opennlp-dl module

rzo1 Tue, 29 Oct 2024 04:55:53 -0700

This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch OPENNLP-1633-Remove-dependency-towards-jackson-databind-in-opennlp-dl-module
in repository https://gitbox.apache.org/repos/asf/opennlp.git


commit efb9518d37b4a2fdaf5341df89d4869f8c18222d
Author: Richard Zowalla <[email protected]>
AuthorDate: Tue Oct 29 12:53:02 2024 +0100

    OPENNLP-1633 - Remove dependency towards jackson-databind in opennlp-dl module
---
 NOTICE                                             |   6 -
 opennlp-brat-annotator/pom.xml                     |   7 --
 opennlp-dl/pom.xml                                 |  12 --
 .../dl/doccat/DocumentCategorizerConfig.java       |  33 +++++-
 .../opennlp/dl/doccat/DocumentCategorizerDL.java   |  42 +++----
 .../dl/doccat/DocumentCategorizerConfigTest.java   | 125 +++++++++++++++++++++
 6 files changed, 169 insertions(+), 56 deletions(-)

diff --git a/NOTICE b/NOTICE
index f5f6c4fc..7c303c75 100644
--- a/NOTICE
+++ b/NOTICE
@@ -93,9 +93,3 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE,  ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-============================================================================
-
-jackson-databind
-https://github.com/FasterXML/jackson-databind
-The Apache Software License, Version 2.0
diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
index 6a4b7b52..58426af5 100644
--- a/opennlp-brat-annotator/pom.xml
+++ b/opennlp-brat-annotator/pom.xml
@@ -61,13 +61,6 @@
                        <artifactId>jackson-databind</artifactId>
                        <version>${jackson.version}</version>
                        <scope>runtime</scope>
-                       <exclusions>
-                               <!-- Byte-Buddy became a dependency by accident 
- TODO remove it with update version > 2.17.0 -->
-                               <exclusion>
-                                       <groupId>net.bytebuddy</groupId>
-                                       <artifactId>byte-buddy</artifactId>
-                               </exclusion>
-                       </exclusions>
                </dependency>
 
                <dependency>
diff --git a/opennlp-dl/pom.xml b/opennlp-dl/pom.xml
index ab52d402..22e80a88 100644
--- a/opennlp-dl/pom.xml
+++ b/opennlp-dl/pom.xml
@@ -41,18 +41,6 @@
       <artifactId>onnxruntime</artifactId>
       <version>${onnxruntime.version}</version>
     </dependency>
-    <dependency>
-      <groupId>com.fasterxml.jackson.core</groupId>
-      <artifactId>jackson-databind</artifactId>
-      <version>${jackson.version}</version>
-      <exclusions>
-        <!-- Byte-Buddy became a dependency by accident - TODO remove it with 
update version > 2.17.0 -->
-        <exclusion>
-          <groupId>net.bytebuddy</groupId>
-          <artifactId>byte-buddy</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
     <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>
diff --git a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java
index 8e6d04e2..218266e9 100644
--- a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java
+++ b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java
@@ -18,18 +18,39 @@
 package opennlp.dl.doccat;
 
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.Map;
+import java.util.Objects;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
-public class DocumentCategorizerConfig {
+public record DocumentCategorizerConfig(Map<String, String> id2label) {
 
-  private Map<String, String> id2label;
+  private static final Pattern ID_TO_LABEL_PATTERN = 
Pattern.compile("\"id2label\"\\s*:\\s*\\{(.*?)\\}", Pattern.DOTALL);
+  private static final Pattern ENTRY_PATTERN = 
Pattern.compile("\"(\\d+)\"\\s*:\\s*\"(.*?)\"");
 
-  public Map<String, String> getId2label() {
+  @Override
+  public Map<String, String> id2label() {
     return Collections.unmodifiableMap(id2label);
   }
 
-  public void setId2label(Map<String, String> id2label) {
-    this.id2label = id2label;
-  }
+  public static DocumentCategorizerConfig fromJson(String json) {
+    Objects.requireNonNull(json, "json must not be null");
+
+    final Map<String, String> id2label = new HashMap<>();
+    final Matcher matcher = ID_TO_LABEL_PATTERN.matcher(json);
+
+    if (matcher.find()) {
+      final String id2labelContent = matcher.group(1);
+      final Matcher entryMatcher = ENTRY_PATTERN.matcher(id2labelContent);
 
+      while (entryMatcher.find()) {
+        final String key = entryMatcher.group(1);
+        final String value = entryMatcher.group(2);
+        id2label.put(key, value);
+      }
+    }
+
+    return new DocumentCategorizerConfig(id2label);
+  }
 }
diff --git a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
index a06d4b09..822af6af 100644
--- a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
+++ b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
@@ -30,14 +30,13 @@ import java.util.Map;
 import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
+
 import java.util.stream.IntStream;
 
 import ai.onnxruntime.OnnxTensor;
 import ai.onnxruntime.OrtEnvironment;
 import ai.onnxruntime.OrtException;
 import ai.onnxruntime.OrtSession;
-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -67,16 +66,15 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
   /**
    * Instantiates a {@link DocumentCategorizer document categorizer} using 
ONNX models.
    *
-   * @param model The ONNX model file.
-   * @param vocabulary The model file's vocabulary file.
-   * @param categories The categories.
+   * @param model                         The ONNX model file.
+   * @param vocabulary                    The model file's vocabulary file.
+   * @param categories                    The categories.
    * @param classificationScoringStrategy Implementation of {@link 
ClassificationScoringStrategy} used
    *                                      to calculate the classification 
scores given the score of each
    *                                      individual document part.
-   * @param inferenceOptions {@link InferenceOptions} to control the inference.
-   *
+   * @param inferenceOptions              {@link InferenceOptions} to control 
the inference.
    * @throws OrtException Thrown if the {@code model} cannot be loaded.
-   * @throws IOException Thrown if errors occurred loading the {@code model} 
or {@code vocabulary}.
+   * @throws IOException  Thrown if errors occurred loading the {@code model} 
or {@code vocabulary}.
    */
   public DocumentCategorizerDL(File model, File vocabulary, Map<Integer, 
String> categories,
                                ClassificationScoringStrategy 
classificationScoringStrategy,
@@ -102,21 +100,20 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
   /**
    * Instantiates a {@link DocumentCategorizer document categorizer} using 
ONNX models.
    *
-   * @param model The ONNX model file.
-   * @param vocabulary The model file's vocabulary file.
-   * @param config The model's config file. The file will be used to determine 
the classification categories.
+   * @param model                         The ONNX model file.
+   * @param vocabulary                    The model file's vocabulary file.
+   * @param config                        The model's config file. The file 
will be used to determine the classification categories.
    * @param classificationScoringStrategy Implementation of {@link 
ClassificationScoringStrategy} used
    *                                      to calculate the classification 
scores given the score of each
    *                                      individual document part.
-   * @param inferenceOptions {@link InferenceOptions} to control the inference.
-   *
+   * @param inferenceOptions              {@link InferenceOptions} to control 
the inference.
    * @throws OrtException Thrown if the {@code model} cannot be loaded.
-   * @throws IOException Thrown if errors occurred loading the {@code model} 
or {@code vocabulary}.
+   * @throws IOException  Thrown if errors occurred loading the {@code model} 
or {@code vocabulary}.
    */
   public DocumentCategorizerDL(File model, File vocabulary, File config,
                                ClassificationScoringStrategy 
classificationScoringStrategy,
                                InferenceOptions inferenceOptions)
-          throws IOException, OrtException {
+      throws IOException, OrtException {
 
     this.env = OrtEnvironment.getEnvironment();
 
@@ -175,7 +172,7 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
       logger.error("Unload to perform document classification inference", ex);
     }
 
-    return new double[]{};
+    return new double[] {};
 
   }
 
@@ -315,6 +312,7 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
 
   /**
    * Applies softmax to an array of values.
+   *
    * @param input An array of values.
    * @return The output array.
    */
@@ -346,18 +344,12 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
   }
 
   private Map<Integer, String> readCategoriesFromFile(File config) throws 
IOException {
-
-    final String json = new String(Files.readAllBytes(config.toPath()));
-
-    final ObjectMapper objectMapper = new ObjectMapper();
-    objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, 
false);
-
     final DocumentCategorizerConfig documentCategorizerConfig =
-        objectMapper.readValue(json, DocumentCategorizerConfig.class);
+        DocumentCategorizerConfig.fromJson(new 
String(Files.readAllBytes(config.toPath())));
 
     final Map<Integer, String> categories = new HashMap<>();
-    for (final String key : documentCategorizerConfig.getId2label().keySet()) {
-      categories.put(Integer.valueOf(key), 
documentCategorizerConfig.getId2label().get(key));
+    for (final String key : documentCategorizerConfig.id2label().keySet()) {
+      categories.put(Integer.valueOf(key), 
documentCategorizerConfig.id2label().get(key));
     }
 
     return categories;
diff --git a/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerConfigTest.java b/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerConfigTest.java
new file mode 100644
index 00000000..a7ff5339
--- /dev/null
+++ b/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerConfigTest.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.dl.doccat;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+
+public class DocumentCategorizerConfigTest {
+
+  @Test
+  public void testId2LabelsFromJsonPrettyValid() {
+    final String json = """
+        {
+          "_num_labels": 5,
+          "architectures": [
+            "BertForSequenceClassification"
+          ],
+          "attention_probs_dropout_prob": 0.1,
+          "directionality": "bidi",
+          "finetuning_task": "sentiment-analysis",
+          "hidden_act": "gelu",
+          "hidden_dropout_prob": 0.1,
+          "hidden_size": 768,
+          "id2label": {
+            "0": "1 star",
+            "1": "2 stars",
+            "2": "3 stars",
+            "3": "4 stars",
+            "4": "5 stars"
+          },
+          "initializer_range": 0.02,
+          "intermediate_size": 3072,
+          "label2id": {
+            "1 star": 0,
+            "2 stars": 1,
+            "3 stars": 2,
+            "4 stars": 3,
+            "5 stars": 4
+          },
+          "layer_norm_eps": 1e-12,
+          "max_position_embeddings": 512,
+          "model_type": "bert",
+          "num_attention_heads": 12,
+          "num_hidden_layers": 12,
+          "output_past": true,
+          "pad_token_id": 0,
+          "pooler_fc_size": 768,
+          "pooler_num_attention_heads": 12,
+          "pooler_num_fc_layers": 3,
+          "pooler_size_per_head": 128,
+          "pooler_type": "first_token_transform",
+          "type_vocab_size": 2,
+          "vocab_size": 105879
+        }
+        """;
+
+      final DocumentCategorizerConfig config = 
DocumentCategorizerConfig.fromJson(json);
+      assertNotNull(config);
+      assertEquals(5, config.id2label().size());
+      assertEquals("1 star", config.id2label().get("0"));
+      assertEquals("2 stars", config.id2label().get("1"));
+      assertEquals("3 stars", config.id2label().get("2"));
+      assertEquals("4 stars", config.id2label().get("3"));
+      assertEquals("5 stars", config.id2label().get("4"));
+  }
+
+  @Test
+  public void testId2LabelsFromJsonUglyValid() {
+    final String json = """
+   
{"_num_labels":5,"architectures":["BertForSequenceClassification"],"attention_probs_dropout_prob":0.1,"directionality":"bidi","finetuning_task":"sentiment-analysis",
+   
"hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":768,"id2label":{"0":"1
 star","1":"2 stars","2":"3 stars","3":"4 stars","4":"5 
stars"},"initializer_range":0.02,
+   "intermediate_size":3072,"label2id":{"1 star":0,"2 stars":1,"3 stars":2,"4 
stars":3,"5 
stars":4},"layer_norm_eps":1e-12,"max_position_embeddings":512,"model_type":"bert",
+   
"num_attention_heads":12,"num_hidden_layers":12,"output_past":true,"pad_token_id":0,"pooler_fc_size":768,"pooler_num_attention_heads":12,"pooler_num_fc_layers":3,
+   
"pooler_size_per_head":128,"pooler_type":"first_token_transform","type_vocab_size":2,"vocab_size":105879}
+   """;
+
+    final DocumentCategorizerConfig config = 
DocumentCategorizerConfig.fromJson(json);
+    assertNotNull(config);
+    assertEquals(5, config.id2label().size());
+    assertEquals("1 star", config.id2label().get("0"));
+    assertEquals("2 stars", config.id2label().get("1"));
+    assertEquals("3 stars", config.id2label().get("2"));
+    assertEquals("4 stars", config.id2label().get("3"));
+    assertEquals("5 stars", config.id2label().get("4"));
+  }
+
+  @Test
+  public void testId2LabelsFromJsonNoValues() {
+    final String json = """
+   
{"_num_labels":5,"architectures":["BertForSequenceClassification"],"attention_probs_dropout_prob":0.1,"directionality":"bidi","finetuning_task":"sentiment-analysis",
+   
"hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":768,"layer_norm_eps":1e-12,"max_position_embeddings":512,"model_type":"bert",
+   
"num_attention_heads":12,"num_hidden_layers":12,"output_past":true,"pad_token_id":0,"pooler_fc_size":768,"pooler_num_attention_heads":12,"pooler_num_fc_layers":3,
+   
"pooler_size_per_head":128,"pooler_type":"first_token_transform","type_vocab_size":2,"vocab_size":105879}
+   """;
+
+    final DocumentCategorizerConfig config = 
DocumentCategorizerConfig.fromJson(json);
+    assertNotNull(config);
+    assertEquals(0, config.id2label().size());
+  }
+
+  @Test
+  public void testId2LabelsFromJsonEmptyInput() {
+    final String json = "";
+    final DocumentCategorizerConfig config = 
DocumentCategorizerConfig.fromJson(json);
+    assertNotNull(config);
+    assertEquals(0, config.id2label().size());
+  }
+}

(opennlp) 01/01: OPENNLP-1633 - Remove dependency towards jackson-databind in opennlp-dl module

Reply via email to