This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch OPENNLP-1633-Remove-dependency-towards-jackson-databind-in-opennlp-dl-module
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit e700f4d7212d25a61f0afa57783a70c7701bbca2
Author: Richard Zowalla <[email protected]>
AuthorDate: Tue Oct 29 12:53:02 2024 +0100

    OPENNLP-1633 - Remove dependency towards jackson-databind in opennlp-dl module
---
 opennlp-brat-annotator/pom.xml                     |   7 --
 opennlp-dl/pom.xml                                 |  12 --
 .../dl/doccat/DocumentCategorizerConfig.java       |  33 +++++-
 .../opennlp/dl/doccat/DocumentCategorizerDL.java   |  42 +++----
 .../dl/doccat/DocumentCategorizerConfigTest.java   | 125 +++++++++++++++++++++
 5 files changed, 169 insertions(+), 50 deletions(-)

diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
index 6a4b7b52..58426af5 100644
--- a/opennlp-brat-annotator/pom.xml
+++ b/opennlp-brat-annotator/pom.xml
@@ -61,13 +61,6 @@
                        <artifactId>jackson-databind</artifactId>
                        <version>${jackson.version}</version>
                        <scope>runtime</scope>
-                       <exclusions>
-                               <!-- Byte-Buddy became a dependency by accident 
- TODO remove it with update version > 2.17.0 -->
-                               <exclusion>
-                                       <groupId>net.bytebuddy</groupId>
-                                       <artifactId>byte-buddy</artifactId>
-                               </exclusion>
-                       </exclusions>
                </dependency>
 
                <dependency>
diff --git a/opennlp-dl/pom.xml b/opennlp-dl/pom.xml
index ab52d402..22e80a88 100644
--- a/opennlp-dl/pom.xml
+++ b/opennlp-dl/pom.xml
@@ -41,18 +41,6 @@
       <artifactId>onnxruntime</artifactId>
       <version>${onnxruntime.version}</version>
     </dependency>
-    <dependency>
-      <groupId>com.fasterxml.jackson.core</groupId>
-      <artifactId>jackson-databind</artifactId>
-      <version>${jackson.version}</version>
-      <exclusions>
-        <!-- Byte-Buddy became a dependency by accident - TODO remove it with 
update version > 2.17.0 -->
-        <exclusion>
-          <groupId>net.bytebuddy</groupId>
-          <artifactId>byte-buddy</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
     <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>
diff --git 
a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java 
b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java
index 8e6d04e2..218266e9 100644
--- a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java
+++ b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java
@@ -18,18 +18,39 @@
 package opennlp.dl.doccat;
 
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.Map;
+import java.util.Objects;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
-public class DocumentCategorizerConfig {
+public record DocumentCategorizerConfig(Map<String, String> id2label) {
 
-  private Map<String, String> id2label;
+  private static final Pattern ID_TO_LABEL_PATTERN = 
Pattern.compile("\"id2label\"\\s*:\\s*\\{(.*?)\\}", Pattern.DOTALL);
+  private static final Pattern ENTRY_PATTERN = 
Pattern.compile("\"(\\d+)\"\\s*:\\s*\"(.*?)\"");
 
-  public Map<String, String> getId2label() {
+  @Override
+  public Map<String, String> id2label() {
     return Collections.unmodifiableMap(id2label);
   }
 
-  public void setId2label(Map<String, String> id2label) {
-    this.id2label = id2label;
-  }
+  public static DocumentCategorizerConfig fromJson(String json) {
+    Objects.requireNonNull(json, "json must not be null");
+
+    final Map<String, String> id2label = new HashMap<>();
+    final Matcher matcher = ID_TO_LABEL_PATTERN.matcher(json);
+
+    if (matcher.find()) {
+      final String id2labelContent = matcher.group(1);
+      final Matcher entryMatcher = ENTRY_PATTERN.matcher(id2labelContent);
 
+      while (entryMatcher.find()) {
+        final String key = entryMatcher.group(1);
+        final String value = entryMatcher.group(2);
+        id2label.put(key, value);
+      }
+    }
+
+    return new DocumentCategorizerConfig(id2label);
+  }
 }
diff --git 
a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java 
b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
index a06d4b09..822af6af 100644
--- a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
+++ b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
@@ -30,14 +30,13 @@ import java.util.Map;
 import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
+
 import java.util.stream.IntStream;
 
 import ai.onnxruntime.OnnxTensor;
 import ai.onnxruntime.OrtEnvironment;
 import ai.onnxruntime.OrtException;
 import ai.onnxruntime.OrtSession;
-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -67,16 +66,15 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
   /**
    * Instantiates a {@link DocumentCategorizer document categorizer} using 
ONNX models.
    *
-   * @param model The ONNX model file.
-   * @param vocabulary The model file's vocabulary file.
-   * @param categories The categories.
+   * @param model                         The ONNX model file.
+   * @param vocabulary                    The model file's vocabulary file.
+   * @param categories                    The categories.
    * @param classificationScoringStrategy Implementation of {@link 
ClassificationScoringStrategy} used
    *                                      to calculate the classification 
scores given the score of each
    *                                      individual document part.
-   * @param inferenceOptions {@link InferenceOptions} to control the inference.
-   *
+   * @param inferenceOptions              {@link InferenceOptions} to control 
the inference.
    * @throws OrtException Thrown if the {@code model} cannot be loaded.
-   * @throws IOException Thrown if errors occurred loading the {@code model} 
or {@code vocabulary}.
+   * @throws IOException  Thrown if errors occurred loading the {@code model} 
or {@code vocabulary}.
    */
   public DocumentCategorizerDL(File model, File vocabulary, Map<Integer, 
String> categories,
                                ClassificationScoringStrategy 
classificationScoringStrategy,
@@ -102,21 +100,20 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
   /**
    * Instantiates a {@link DocumentCategorizer document categorizer} using 
ONNX models.
    *
-   * @param model The ONNX model file.
-   * @param vocabulary The model file's vocabulary file.
-   * @param config The model's config file. The file will be used to determine 
the classification categories.
+   * @param model                         The ONNX model file.
+   * @param vocabulary                    The model file's vocabulary file.
+   * @param config                        The model's config file. The file 
will be used to determine the classification categories.
    * @param classificationScoringStrategy Implementation of {@link 
ClassificationScoringStrategy} used
    *                                      to calculate the classification 
scores given the score of each
    *                                      individual document part.
-   * @param inferenceOptions {@link InferenceOptions} to control the inference.
-   *
+   * @param inferenceOptions              {@link InferenceOptions} to control 
the inference.
    * @throws OrtException Thrown if the {@code model} cannot be loaded.
-   * @throws IOException Thrown if errors occurred loading the {@code model} 
or {@code vocabulary}.
+   * @throws IOException  Thrown if errors occurred loading the {@code model} 
or {@code vocabulary}.
    */
   public DocumentCategorizerDL(File model, File vocabulary, File config,
                                ClassificationScoringStrategy 
classificationScoringStrategy,
                                InferenceOptions inferenceOptions)
-          throws IOException, OrtException {
+      throws IOException, OrtException {
 
     this.env = OrtEnvironment.getEnvironment();
 
@@ -175,7 +172,7 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
       logger.error("Unload to perform document classification inference", ex);
     }
 
-    return new double[]{};
+    return new double[] {};
 
   }
 
@@ -315,6 +312,7 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
 
   /**
    * Applies softmax to an array of values.
+   *
    * @param input An array of values.
    * @return The output array.
    */
@@ -346,18 +344,12 @@ public class DocumentCategorizerDL extends AbstractDL 
implements DocumentCategor
   }
 
   private Map<Integer, String> readCategoriesFromFile(File config) throws 
IOException {
-
-    final String json = new String(Files.readAllBytes(config.toPath()));
-
-    final ObjectMapper objectMapper = new ObjectMapper();
-    objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, 
false);
-
     final DocumentCategorizerConfig documentCategorizerConfig =
-        objectMapper.readValue(json, DocumentCategorizerConfig.class);
+        DocumentCategorizerConfig.fromJson(new 
String(Files.readAllBytes(config.toPath())));
 
     final Map<Integer, String> categories = new HashMap<>();
-    for (final String key : documentCategorizerConfig.getId2label().keySet()) {
-      categories.put(Integer.valueOf(key), 
documentCategorizerConfig.getId2label().get(key));
+    for (final String key : documentCategorizerConfig.id2label().keySet()) {
+      categories.put(Integer.valueOf(key), 
documentCategorizerConfig.id2label().get(key));
     }
 
     return categories;
diff --git 
a/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerConfigTest.java 
b/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerConfigTest.java
new file mode 100644
index 00000000..a7ff5339
--- /dev/null
+++ 
b/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerConfigTest.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.dl.doccat;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+
+public class DocumentCategorizerConfigTest {
+
+  @Test
+  public void testId2LabelsFromJsonPrettyValid() {
+    final String json = """
+        {
+          "_num_labels": 5,
+          "architectures": [
+            "BertForSequenceClassification"
+          ],
+          "attention_probs_dropout_prob": 0.1,
+          "directionality": "bidi",
+          "finetuning_task": "sentiment-analysis",
+          "hidden_act": "gelu",
+          "hidden_dropout_prob": 0.1,
+          "hidden_size": 768,
+          "id2label": {
+            "0": "1 star",
+            "1": "2 stars",
+            "2": "3 stars",
+            "3": "4 stars",
+            "4": "5 stars"
+          },
+          "initializer_range": 0.02,
+          "intermediate_size": 3072,
+          "label2id": {
+            "1 star": 0,
+            "2 stars": 1,
+            "3 stars": 2,
+            "4 stars": 3,
+            "5 stars": 4
+          },
+          "layer_norm_eps": 1e-12,
+          "max_position_embeddings": 512,
+          "model_type": "bert",
+          "num_attention_heads": 12,
+          "num_hidden_layers": 12,
+          "output_past": true,
+          "pad_token_id": 0,
+          "pooler_fc_size": 768,
+          "pooler_num_attention_heads": 12,
+          "pooler_num_fc_layers": 3,
+          "pooler_size_per_head": 128,
+          "pooler_type": "first_token_transform",
+          "type_vocab_size": 2,
+          "vocab_size": 105879
+        }
+        """;
+
+      final DocumentCategorizerConfig config = 
DocumentCategorizerConfig.fromJson(json);
+      assertNotNull(config);
+      assertEquals(5, config.id2label().size());
+      assertEquals("1 star", config.id2label().get("0"));
+      assertEquals("2 stars", config.id2label().get("1"));
+      assertEquals("3 stars", config.id2label().get("2"));
+      assertEquals("4 stars", config.id2label().get("3"));
+      assertEquals("5 stars", config.id2label().get("4"));
+  }
+
+  @Test
+  public void testId2LabelsFromJsonUglyValid() {
+    final String json = """
+   
{"_num_labels":5,"architectures":["BertForSequenceClassification"],"attention_probs_dropout_prob":0.1,"directionality":"bidi","finetuning_task":"sentiment-analysis",
+   
"hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":768,"id2label":{"0":"1
 star","1":"2 stars","2":"3 stars","3":"4 stars","4":"5 
stars"},"initializer_range":0.02,
+   "intermediate_size":3072,"label2id":{"1 star":0,"2 stars":1,"3 stars":2,"4 
stars":3,"5 
stars":4},"layer_norm_eps":1e-12,"max_position_embeddings":512,"model_type":"bert",
+   
"num_attention_heads":12,"num_hidden_layers":12,"output_past":true,"pad_token_id":0,"pooler_fc_size":768,"pooler_num_attention_heads":12,"pooler_num_fc_layers":3,
+   
"pooler_size_per_head":128,"pooler_type":"first_token_transform","type_vocab_size":2,"vocab_size":105879}
+   """;
+
+    final DocumentCategorizerConfig config = 
DocumentCategorizerConfig.fromJson(json);
+    assertNotNull(config);
+    assertEquals(5, config.id2label().size());
+    assertEquals("1 star", config.id2label().get("0"));
+    assertEquals("2 stars", config.id2label().get("1"));
+    assertEquals("3 stars", config.id2label().get("2"));
+    assertEquals("4 stars", config.id2label().get("3"));
+    assertEquals("5 stars", config.id2label().get("4"));
+  }
+
+  @Test
+  public void testId2LabelsFromJsonNoValues() {
+    final String json = """
+   
{"_num_labels":5,"architectures":["BertForSequenceClassification"],"attention_probs_dropout_prob":0.1,"directionality":"bidi","finetuning_task":"sentiment-analysis",
+   
"hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":768,"layer_norm_eps":1e-12,"max_position_embeddings":512,"model_type":"bert",
+   
"num_attention_heads":12,"num_hidden_layers":12,"output_past":true,"pad_token_id":0,"pooler_fc_size":768,"pooler_num_attention_heads":12,"pooler_num_fc_layers":3,
+   
"pooler_size_per_head":128,"pooler_type":"first_token_transform","type_vocab_size":2,"vocab_size":105879}
+   """;
+
+    final DocumentCategorizerConfig config = 
DocumentCategorizerConfig.fromJson(json);
+    assertNotNull(config);
+    assertEquals(0, config.id2label().size());
+  }
+
+  @Test
+  public void testId2LabelsFromJsonEmptyInput() {
+    final String json = "";
+    final DocumentCategorizerConfig config = 
DocumentCategorizerConfig.fromJson(json);
+    assertNotNull(config);
+    assertEquals(0, config.id2label().size());
+  }
+}

Reply via email to