kinow commented on code in PR #73: URL: https://github.com/apache/opennlp-sandbox/pull/73#discussion_r1083418556
########## tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/WordIndexerTest.java: ########## @@ -76,55 +76,53 @@ public void testToTokenIds_TwoSentences() { TokenIds ids = indexer.toTokenIds(collect.toArray(new String[2][])); - Assert.assertEquals(8, ids.getWordIds()[0].length); - Assert.assertEquals(12, ids.getWordIds()[1].length); - - Assert.assertArrayEquals(new int[] {4}, ids.getCharIds()[0][0]); - Assert.assertArrayEquals(new int[] {6, 82, 54, 76}, ids.getCharIds()[0][1]); - Assert.assertArrayEquals(new int[] {4}, ids.getCharIds()[0][2]); - Assert.assertArrayEquals(new int[] {6, 41, 54}, ids.getCharIds()[0][3]); - Assert.assertArrayEquals(new int[] {59, 34, 80, 31}, ids.getCharIds()[0][4]); - Assert.assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[0][5]); - Assert.assertArrayEquals(new int[] {51, 34, 46, 83, 31, 76, 41, 28, 83, 31}, ids.getCharIds()[0][6]); - Assert.assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, ids.getCharIds()[0][7]); - - Assert.assertArrayEquals(new int[] {36, 34, 31, 41, 55, 23}, ids.getCharIds()[1][0]); - Assert.assertArrayEquals(new int[] {52, 80, 50, 42, 46}, ids.getCharIds()[1][1]); - Assert.assertArrayEquals(new int[] {23, 82, 83, 23}, ids.getCharIds()[1][2]); - Assert.assertArrayEquals(new int[] {34, 31}, ids.getCharIds()[1][3]); - Assert.assertArrayEquals(new int[] {76, 82, 54}, ids.getCharIds()[1][4]); - Assert.assertArrayEquals(new int[] {6, 41, 3}, ids.getCharIds()[1][5]); - Assert.assertArrayEquals(new int[] {30, 34}, ids.getCharIds()[1][6]); - Assert.assertArrayEquals(new int[] {52, 82, 11, 34, 55, 82}, ids.getCharIds()[1][7]); - Assert.assertArrayEquals(new int[] {74, 41, 80, 23, 83, 31, 54}, ids.getCharIds()[1][8]); - Assert.assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[1][9]); - Assert.assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, ids.getCharIds()[1][10]); - Assert.assertArrayEquals(new int[] {65}, ids.getCharIds()[1][11]); - - Assert.assertEquals(21931, ids.getWordIds()[0][0]); - Assert.assertEquals(20473, ids.getWordIds()[0][1]); - Assert.assertEquals(21931, ids.getWordIds()[0][2]); - Assert.assertEquals(5477, ids.getWordIds()[0][3]); - Assert.assertEquals(11538, ids.getWordIds()[0][4]); - Assert.assertEquals(21341, ids.getWordIds()[0][5]); - Assert.assertEquals(14024, ids.getWordIds()[0][6]); - Assert.assertEquals(7420, ids.getWordIds()[0][7]); - - Assert.assertEquals(12492, ids.getWordIds()[1][0]); - Assert.assertEquals(2720, ids.getWordIds()[1][1]); - Assert.assertEquals(9476, ids.getWordIds()[1][2]); - Assert.assertEquals(16537, ids.getWordIds()[1][3]); - Assert.assertEquals(18966, ids.getWordIds()[1][4]); - Assert.assertEquals(21088, ids.getWordIds()[1][5]); - Assert.assertEquals(16601, ids.getWordIds()[1][6]); - Assert.assertEquals(2720, ids.getWordIds()[1][7]); - Assert.assertEquals(2720, ids.getWordIds()[1][8]); - Assert.assertEquals(21341, ids.getWordIds()[1][9]); - Assert.assertEquals(7420, ids.getWordIds()[1][10]); - Assert.assertEquals(2684, ids.getWordIds()[1][11]); - + assertEquals(8, ids.getWordIds()[0].length); + assertEquals(12, ids.getWordIds()[1].length); + + assertArrayEquals(new int[] {4}, ids.getCharIds()[0][0]); + assertArrayEquals(new int[] {6, 82, 54, 76}, ids.getCharIds()[0][1]); + assertArrayEquals(new int[] {4}, ids.getCharIds()[0][2]); + assertArrayEquals(new int[] {6, 41, 54}, ids.getCharIds()[0][3]); + assertArrayEquals(new int[] {59, 34, 80, 31}, ids.getCharIds()[0][4]); + assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[0][5]); + assertArrayEquals(new int[] {51, 34, 46, 83, 31, 76, 41, 28, 83, 31}, ids.getCharIds()[0][6]); + assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, ids.getCharIds()[0][7]); + + assertArrayEquals(new int[] {36, 34, 31, 41, 55, 23}, ids.getCharIds()[1][0]); + assertArrayEquals(new int[] {52, 80, 50, 42, 46}, ids.getCharIds()[1][1]); + assertArrayEquals(new int[] {23, 82, 83, 23}, ids.getCharIds()[1][2]); + assertArrayEquals(new int[] {34, 31}, ids.getCharIds()[1][3]); + assertArrayEquals(new int[] {76, 82, 54}, ids.getCharIds()[1][4]); + assertArrayEquals(new int[] {6, 41, 3}, ids.getCharIds()[1][5]); + assertArrayEquals(new int[] {30, 34}, ids.getCharIds()[1][6]); + assertArrayEquals(new int[] {52, 82, 11, 34, 55, 82}, ids.getCharIds()[1][7]); + assertArrayEquals(new int[] {74, 41, 80, 23, 83, 31, 54}, ids.getCharIds()[1][8]); + assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[1][9]); + assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, ids.getCharIds()[1][10]); + assertArrayEquals(new int[] {65}, ids.getCharIds()[1][11]); + + // TODO investigate why the 6 commented checks are different: Different data / assertions? Review Comment: :+1: really strange. Tried a few things locally but couldn't get these tests to pass. `master` branch broken too for me :shrug: (I get a NPE in the before class due to `.txt` vs `.txt.gz`, but fixing that gives me "java.lang.RuntimeException: Unknown word 'Stormy' is not allowed." again :dizzy_face: ) ########## tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java: ########## @@ -1,31 +1,39 @@ package org.apache.opennlp.namefinder; -import java.io.IOException; +import org.junit.Ignore; +import org.junit.Test; import opennlp.tools.util.Span; +import java.io.IOException; +import java.nio.file.Path; + public class PredictTest { - public static void main(String[] args) throws IOException { + @Test @Ignore + // TODO This test is not platform neutral and, for instance, fails with: + // "Cannot find TensorFlow native library for OS: darwin, architecture: aarch64" + // We need JUnit 5 in the sandbox to circumvent this, so it can be run in supported environments Review Comment: :+1: ########## tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java: ########## @@ -36,32 +37,39 @@ public class WordIndexer { public static String UNK = "$UNK$"; public static String NUM = "$NUM$"; - private boolean lowerCase = false; - private boolean allowUnk = false; + private final boolean lowerCase = false; + private final boolean allowUnk = true; Review Comment: The tests in this component are really awkward. I was curious why you had to change from `false` to `true`. I opened the `words.txt` file and there's no `stormy` there, and with unknowns not allowed here I get a `java.lang.RuntimeException: Unknown word 'Stormy' is not allowed.`. Not sure how this test was passing before (cannot see any steamming/lemmatizer anywhere in the indexer). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@opennlp.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org