kinow commented on code in PR #73:
URL: https://github.com/apache/opennlp-sandbox/pull/73#discussion_r1083418556


##########
tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/WordIndexerTest.java:
##########
@@ -76,55 +76,53 @@ public void testToTokenIds_TwoSentences() {
 
     TokenIds ids = indexer.toTokenIds(collect.toArray(new String[2][]));
 
-    Assert.assertEquals(8, ids.getWordIds()[0].length);
-    Assert.assertEquals(12, ids.getWordIds()[1].length);
-
-    Assert.assertArrayEquals(new int[] {4}, ids.getCharIds()[0][0]);
-    Assert.assertArrayEquals(new int[] {6, 82, 54, 76}, 
ids.getCharIds()[0][1]);
-    Assert.assertArrayEquals(new int[] {4}, ids.getCharIds()[0][2]);
-    Assert.assertArrayEquals(new int[] {6, 41, 54}, ids.getCharIds()[0][3]);
-    Assert.assertArrayEquals(new int[] {59, 34, 80, 31}, 
ids.getCharIds()[0][4]);
-    Assert.assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[0][5]);
-    Assert.assertArrayEquals(new int[] {51, 34, 46, 83, 31, 76, 41, 28, 83, 
31}, ids.getCharIds()[0][6]);
-    Assert.assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, 
ids.getCharIds()[0][7]);
-
-    Assert.assertArrayEquals(new int[] {36, 34, 31, 41, 55, 23}, 
ids.getCharIds()[1][0]);
-    Assert.assertArrayEquals(new int[] {52, 80, 50, 42, 46}, 
ids.getCharIds()[1][1]);
-    Assert.assertArrayEquals(new int[] {23, 82, 83, 23}, 
ids.getCharIds()[1][2]);
-    Assert.assertArrayEquals(new int[] {34, 31}, ids.getCharIds()[1][3]);
-    Assert.assertArrayEquals(new int[] {76, 82, 54}, ids.getCharIds()[1][4]);
-    Assert.assertArrayEquals(new int[] {6, 41, 3}, ids.getCharIds()[1][5]);
-    Assert.assertArrayEquals(new int[] {30, 34}, ids.getCharIds()[1][6]);
-    Assert.assertArrayEquals(new int[] {52, 82, 11, 34, 55, 82}, 
ids.getCharIds()[1][7]);
-    Assert.assertArrayEquals(new int[] {74, 41, 80, 23, 83, 31, 54}, 
ids.getCharIds()[1][8]);
-    Assert.assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[1][9]);
-    Assert.assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, 
ids.getCharIds()[1][10]);
-    Assert.assertArrayEquals(new int[] {65}, ids.getCharIds()[1][11]);
-
-    Assert.assertEquals(21931, ids.getWordIds()[0][0]);
-    Assert.assertEquals(20473, ids.getWordIds()[0][1]);
-    Assert.assertEquals(21931, ids.getWordIds()[0][2]);
-    Assert.assertEquals(5477, ids.getWordIds()[0][3]);
-    Assert.assertEquals(11538, ids.getWordIds()[0][4]);
-    Assert.assertEquals(21341, ids.getWordIds()[0][5]);
-    Assert.assertEquals(14024, ids.getWordIds()[0][6]);
-    Assert.assertEquals(7420, ids.getWordIds()[0][7]);
-
-    Assert.assertEquals(12492, ids.getWordIds()[1][0]);
-    Assert.assertEquals(2720, ids.getWordIds()[1][1]);
-    Assert.assertEquals(9476, ids.getWordIds()[1][2]);
-    Assert.assertEquals(16537, ids.getWordIds()[1][3]);
-    Assert.assertEquals(18966, ids.getWordIds()[1][4]);
-    Assert.assertEquals(21088, ids.getWordIds()[1][5]);
-    Assert.assertEquals(16601, ids.getWordIds()[1][6]);
-    Assert.assertEquals(2720, ids.getWordIds()[1][7]);
-    Assert.assertEquals(2720, ids.getWordIds()[1][8]);
-    Assert.assertEquals(21341, ids.getWordIds()[1][9]);
-    Assert.assertEquals(7420, ids.getWordIds()[1][10]);
-    Assert.assertEquals(2684, ids.getWordIds()[1][11]);
-
+    assertEquals(8, ids.getWordIds()[0].length);
+    assertEquals(12, ids.getWordIds()[1].length);
+
+    assertArrayEquals(new int[] {4}, ids.getCharIds()[0][0]);
+    assertArrayEquals(new int[] {6, 82, 54, 76}, ids.getCharIds()[0][1]);
+    assertArrayEquals(new int[] {4}, ids.getCharIds()[0][2]);
+    assertArrayEquals(new int[] {6, 41, 54}, ids.getCharIds()[0][3]);
+    assertArrayEquals(new int[] {59, 34, 80, 31}, ids.getCharIds()[0][4]);
+    assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[0][5]);
+    assertArrayEquals(new int[] {51, 34, 46, 83, 31, 76, 41, 28, 83, 31}, 
ids.getCharIds()[0][6]);
+    assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, 
ids.getCharIds()[0][7]);
+
+    assertArrayEquals(new int[] {36, 34, 31, 41, 55, 23}, 
ids.getCharIds()[1][0]);
+    assertArrayEquals(new int[] {52, 80, 50, 42, 46}, ids.getCharIds()[1][1]);
+    assertArrayEquals(new int[] {23, 82, 83, 23}, ids.getCharIds()[1][2]);
+    assertArrayEquals(new int[] {34, 31}, ids.getCharIds()[1][3]);
+    assertArrayEquals(new int[] {76, 82, 54}, ids.getCharIds()[1][4]);
+    assertArrayEquals(new int[] {6, 41, 3}, ids.getCharIds()[1][5]);
+    assertArrayEquals(new int[] {30, 34}, ids.getCharIds()[1][6]);
+    assertArrayEquals(new int[] {52, 82, 11, 34, 55, 82}, 
ids.getCharIds()[1][7]);
+    assertArrayEquals(new int[] {74, 41, 80, 23, 83, 31, 54}, 
ids.getCharIds()[1][8]);
+    assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[1][9]);
+    assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, 
ids.getCharIds()[1][10]);
+    assertArrayEquals(new int[] {65}, ids.getCharIds()[1][11]);
+
+    // TODO investigate why the 6 commented checks are different: Different 
data / assertions?

Review Comment:
   :+1:  really strange. Tried a few things locally but couldn't get these 
tests to pass. `master` branch broken too for me :shrug: (I get a NPE in the 
before class due to `.txt` vs `.txt.gz`, but fixing that gives me 
"java.lang.RuntimeException: Unknown word 'Stormy' is not allowed." again 
:dizzy_face: )



##########
tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java:
##########
@@ -1,31 +1,39 @@
 package org.apache.opennlp.namefinder;
 
-import java.io.IOException;
+import org.junit.Ignore;
+import org.junit.Test;
 
 import opennlp.tools.util.Span;
 
+import java.io.IOException;
+import java.nio.file.Path;
+
 public class PredictTest {
 
-  public static void main(String[] args) throws IOException {
+  @Test @Ignore
+  // TODO This test is not platform neutral and, for instance, fails with:
+  //  "Cannot find TensorFlow native library for OS: darwin, architecture: 
aarch64"
+  //  We need JUnit 5 in the sandbox to circumvent this, so it can be run in 
supported environments

Review Comment:
   :+1: 



##########
tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java:
##########
@@ -36,32 +37,39 @@ public class WordIndexer {
   public static String UNK = "$UNK$";
   public static String NUM = "$NUM$";
 
-  private boolean lowerCase = false;
-  private boolean allowUnk = false;
+  private final boolean lowerCase = false;
+  private final boolean allowUnk = true;

Review Comment:
   The tests in this component are really awkward. I was curious why you had to 
change from `false` to `true`. I opened the `words.txt` file and there's no 
`stormy` there, and with unknowns not allowed here I get a 
`java.lang.RuntimeException: Unknown word 'Stormy' is not allowed.`. Not sure 
how this test was passing before (cannot see any steamming/lemmatizer anywhere 
in the indexer).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@opennlp.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to