Repository: opennlp
Updated Branches:
  refs/heads/master 406021733 -> 60595251e


OPENNLP-1035: Add unit tests and javadocs for BrownBigramFeatureGenerator,
closes apache/opennlp#174


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/60595251
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/60595251
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/60595251

Branch: refs/heads/master
Commit: 60595251eec5979e14540c6d00043e24905a7404
Parents: 4060217
Author: jzonthemtn <[email protected]>
Authored: Tue Apr 25 08:05:49 2017 -0400
Committer: smarthi <[email protected]>
Committed: Tue Apr 25 08:05:49 2017 -0400

----------------------------------------------------------------------
 .../featuregen/BrownBigramFeatureGenerator.java |  20 +-
 .../BrownBigramFeatureGeneratorTest.java        |  87 +++
 .../opennlp/tools/formats/brown-cluster.txt     | 665 +++++++++++++++++++
 3 files changed, 764 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
index 4f0a24a..f16ba97 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
@@ -24,25 +24,30 @@ import java.util.List;
  */
 public class BrownBigramFeatureGenerator implements AdaptiveFeatureGenerator {
 
-  private BrownCluster brownLexicon;
-
-  public BrownBigramFeatureGenerator(BrownCluster dict) {
-    this.brownLexicon = dict;
+  private BrownCluster brownCluster;
+
+  /**
+   * Creates a new Brown Cluster bigram feature generator.
+   * @param brownCluster A {@link BrownCluster}.
+   */
+  public BrownBigramFeatureGenerator(BrownCluster brownCluster) {
+    this.brownCluster = brownCluster;
   }
 
+  @Override
   public void createFeatures(List<String> features, String[] tokens, int index,
       String[] previousOutcomes) {
 
-    List<String> wordClasses = BrownTokenClasses.getWordClasses(tokens[index], 
brownLexicon);
+    List<String> wordClasses = BrownTokenClasses.getWordClasses(tokens[index], 
brownCluster);
     if (index > 0) {
-      List<String> prevWordClasses = 
BrownTokenClasses.getWordClasses(tokens[index - 1], brownLexicon);
+      List<String> prevWordClasses = 
BrownTokenClasses.getWordClasses(tokens[index - 1], brownCluster);
       for (int i = 0; i < wordClasses.size() && i < prevWordClasses.size(); 
i++)
       features.add("p" + "browncluster" + "," + "browncluster" + "="
           + prevWordClasses.get(i) + "," + wordClasses.get(i));
     }
 
     if (index + 1 < tokens.length) {
-      List<String> nextWordClasses = 
BrownTokenClasses.getWordClasses(tokens[index + 1], brownLexicon);
+      List<String> nextWordClasses = 
BrownTokenClasses.getWordClasses(tokens[index + 1], brownCluster);
       for (int i = 0; i < wordClasses.size() && i < nextWordClasses.size(); 
i++) {
         features.add("browncluster" + "," + "n" + "browncluster" + "="
             + wordClasses.get(i) + "," + nextWordClasses.get(i));
@@ -51,4 +56,3 @@ public class BrownBigramFeatureGenerator implements AdaptiveFeatureGenerator {
   }
 
 }
-

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
new file mode 100644
index 0000000..03810e8
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+
+public class BrownBigramFeatureGeneratorTest {
+
+  private AdaptiveFeatureGenerator generator;
+  
+  @Before
+  public void setup() throws IOException {
+
+    ResourceAsStreamFactory stream = new ResourceAsStreamFactory(
+        getClass(), "/opennlp/tools/formats/brown-cluster.txt");
+
+    BrownCluster brownCluster = new BrownCluster(stream.createInputStream()); 
+    
+    generator = new BrownBigramFeatureGenerator(brownCluster);
+
+  }
+
+  @Test
+  public void createFeaturesTest() throws IOException {
+
+    String[] tokens = new String[] {"he", "went", "with", "you"};
+
+    List<String> features = new ArrayList<>();
+    generator.createFeatures(features, tokens, 3, null);
+
+    Assert.assertEquals(2, features.size());
+    
Assert.assertTrue(features.contains("pbrowncluster,browncluster=0101,0010"));
+    
Assert.assertTrue(features.contains("pbrowncluster,browncluster=01010,00101"));
+    
+  }
+  
+  @Test
+  public void createFeaturesSuccessiveTokensTest() throws IOException {
+
+    final String[] testSentence = new String[] {"he", "went", "with", "you", 
"in", "town"};
+
+    List<String> features = new ArrayList<>();
+    generator.createFeatures(features, testSentence, 3, null);
+
+    Assert.assertEquals(3, features.size());
+    
Assert.assertTrue(features.contains("pbrowncluster,browncluster=0101,0010"));
+    
Assert.assertTrue(features.contains("pbrowncluster,browncluster=01010,00101"));
+    
Assert.assertTrue(features.contains("browncluster,nbrowncluster=0010,0000"));
+    
+  }
+  
+  @Test
+  public void noFeaturesTest() throws IOException {
+
+    final String[] testSentence = new String[] {"he", "went", "with", "you"};
+
+    List<String> features = new ArrayList<>();
+    generator.createFeatures(features, testSentence, 0, null);
+
+    Assert.assertEquals(0, features.size());
+    
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
new file mode 100644
index 0000000..df31bc7
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
@@ -0,0 +1,665 @@
+0000   18,     1
+0000   wedding 1
+0000   A       1
+0000   No,     1
+0000   prefered        1
+0000   hurry   1
+0000   address?        1
+0000   sounds  1
+0000   any     1
+0000   soon,   1
+0000   in      56
+0000   Worcesterstreet 1
+00010  summer. 1
+00010  56473   1
+00010  different       1
+00010  20193   1
+00010  Ulm     1
+00010  17818   1
+00010  beautiful       1
+00010  23213   1
+00010  12424   1
+00010  Rue-de-Grandes-Illusions        1
+00010  good.   1
+00010  Barmerstr.      1
+00010  81737   1
+00010  order   1
+00010  1912    1
+00010  63737   1
+00010  Chesterstr.     1
+00010  80333   1
+00010  81234   1
+00010  that's  1
+00010  78181   1
+00010  30291   1
+00010  84630   1
+00010  25334   1
+00010  30303   2
+00010  Leipzig.        2
+00010  your    3
+00010  her     10
+000110 5.      1
+000110 Hamburg,        1
+000110 contact 1
+000110 faked.  1
+000110 streetname      1
+000110 34.     1
+000110 83939   1
+000110 25.     1
+000110 2.      1
+000110 part-time       1
+000110 help-wanted     1
+000110 11      1
+000110 some    1
+000110 Gauting.        1
+000110 address.        1
+000110 parent's        1
+000110 reply.  1
+000110 touch   1
+000110 Berlin. 5
+000110 Munich. 5
+000111 there,  1
+000111 Schulz  1
+000111 Paris   1
+000111 Edinburgh,      1
+000111 day     1
+000111 1       1
+000111 you?    1
+000111 saw     1
+000111 see     1
+000111 house   1
+000111 recently        1
+000111 Don't   1
+000111 back    1
+000111 apartment       1
+000111 12,     1
+000111 Are     2
+000111 Could   2
+000111 did     2
+000111 job     2
+000111 still   3
+000111 Thank   3
+000111 up      3
+00100  30202.  1
+00100  Yesterday,      1
+00100  ad      1
+00100  homesick,       1
+00100  Now,    1
+00100  man     1
+00100  help.   1
+00100  area.   1
+00100  "Westbad".      1
+00100  or      2
+00100  It's    2
+00100  It      2
+00100  The     7
+00100  As      3
+00101  Arent't 1
+00101  offer.  1
+00101  celebrated      1
+00101  available.      1
+00101  spontaneously.  1
+00101  sounding        1
+00101  party   2
+00101  you     12
+001100 last    1
+001100 called, 1
+001100 That    1
+001100 life    1
+001100 pointed 1
+001100 building        1
+001100 restaurant      1
+001100 5,      1
+001100 one     1
+001100 interested      1
+001100 located 1
+001100 Please  1
+001100 answered        1
+001100 Hospital        1
+001100 112,    2
+001100 arrived 3
+001100 lived   4
+001100 lives   4
+001101 Unter-den-Linden        1
+001101 this    1
+001101 moment. 1
+001101 tip     1
+001101 10th    1
+001101 reckon. 1
+001101 factory 1
+001101 line    1
+001101 Paracelsus      1
+001101 Alan    1
+001101 it's    2
+001101 company 2
+001101 who     4
+001110 didn't  1
+001110 postcode        1
+001110 police  1
+001110 building.       1
+001110 concierge       1
+001110 flaring 1
+001110 finally 3
+001110 she     7
+001110 Last    4
+001110 She     5
+0011110        Erding, 1
+0011110        Spain,  1
+0011110        resident,       1
+0011110        lady,   1
+0011110        later   1
+0011110        business        1
+0011110        idea    1
+0011110        Berlin  1
+0011110        England,        1
+0011110        Sure,   1
+0011110        ,       10
+0011110        longer  1
+0011111        is.     1
+0011111        15      1
+0011111        Schneider       1
+0011111        Hinterhofer     1
+0011111        me.     1
+0011111        Our     1
+0011111        Seile   1
+0011111        Meier   1
+0011111        Bauer   1
+0011111        Sander  1
+0011111        Clara   1
+0011111        Schmidt 2
+0011111        minutes 2
+0011111        Miller  5
+0100   school  1
+0100   They    1
+0100   8       1
+0100   9       1
+0100   Europe. 1
+0100   those   1
+0100   Baumann,        1
+0100   a       38
+0100   high    1
+01010  About   1
+01010  has     1
+01010  us,     1
+01010  13,     1
+01010  university.     1
+01010  tell    1
+01010  On      2
+01010  than    2
+01010  An      2
+01010  Alisa   2
+01010  on      3
+01010  with    7
+01010  called  5
+01010  got     5
+01011  through 1
+01011  shoes?  1
+01011  city.   1
+01011  quickly 1
+01011  trauma, 1
+01011  situate 1
+01011  much!   1
+01011  then,   1
+01011  friday! 1
+01011  about   1
+01011  knew    2
+01011  of      17
+01011  him     3
+011000 drove   1
+011000 Yes,    1
+011000 away.   1
+011000 parents'        1
+011000 life-threatening,       1
+011000 Weilheim,       1
+011000 15.     1
+011000 33,     1
+011000 86th    1
+011000 1995.   1
+011000 apartment,      1
+011000 took    2
+011000 where   3
+011000 if      5
+011000 But     7
+011001 the     54
+011001 Blumenweg       1
+011010 problem 1
+011010 country 1
+011010 Her     1
+011010 rumour  1
+011010 middle-aged     1
+011010 police. 1
+011010 exhibition.     1
+011010 empty   1
+011010 hours   1
+011010 father  1
+011010 area    1
+011010 staff   1
+011010 Reichstag.      1
+011010 "Tapasbar"      1
+011010 to.     1
+011010 Lenbachhaus     1
+011010 complete        1
+011010 owner   1
+011010 1.      1
+011010 11,     1
+011010 15,     2
+011010 street  2
+011010 accident        2
+011010 Ostbahnhof      2
+011010 address 3
+0110110        help    1
+0110110        grateful        1
+0110110        singer  1
+0110110        new     1
+0110110        moment  1
+0110110        costumers       1
+0110110        ancestors.      1
+0110110        Schubert        1
+0110110        ups     1
+0110110        pedestrians.    1
+0110110        hint    1
+0110110        semester,       1
+0110110        aunt    1
+0110110        face-to-face,   1
+0110110        guests  1
+0110110        happy   1
+0110110        number  2
+0110110        6,      2
+0110110        name    8
+01101110       French  1
+01101110       Luise   1
+01101110       knowledge       1
+01101110       pictures        1
+01101110       them    2
+01101110       away    2
+01101110       out     4
+01101110       years   2
+01101111       pain,   1
+01101111       Is      1
+01101111       sign    1
+01101111       home,   1
+01101111       14,     1
+01101111       appreciated     1
+01101111       happened        1
+01101111       by      1
+01101111       point:  1
+01101111       opened  2
+01101111       near    4
+01101111       instantly       3
+01110  taxi    1
+01110  p.m.!   1
+01110  13      1
+01110  barbecue.       1
+01110  speed   1
+01110  tree.   1
+01110  tenant  1
+01110  metropolis      1
+01110  delivery        1
+01110  family  1
+01110  list    1
+01110  week.   1
+01110  student,        1
+01110  delicious       1
+01110  good    1
+01110  well-payed      1
+01110  student 1
+01110  person! 1
+01110  smaller 1
+01110  small   2
+01110  more    2
+01110  look    2
+01110  quite   2
+01110  bigger  2
+01110  young   2
+01110  tourist 2
+01110  great   3
+01110  letter  3
+01110  friend  4
+0111100        Elenor  1
+0111100        definitely      1
+0111100        Gina    1
+0111100        currently       1
+0111100        Marie   1
+0111100        McKennedy       1
+0111100        ten     1
+0111100        sometimes.      1
+0111100        Michael 1
+0111100        Michel  1
+0111100        competent       1
+0111100        Gerhard 1
+0111100        Stefanie        2
+0111100        five    2
+0111100        Mike    2
+0111100        Stefan  3
+0111101        particulary     1
+0111101        broken. 1
+0111101        10      1
+0111101        leather?        1
+0111101        grandaunt.      1
+0111101        90      1
+0111101        Julie   1
+0111101        badly   1
+0111101        you:    1
+0111101        July    1
+0111101        painfully       1
+0111101        founded 1
+0111101        Fernandes       1
+0111101        old     2
+0111101        elderly 2
+0111101        March   2
+0111101        him.    2
+0111101        2       2
+0111101        an      5
+0111110        6th     1
+0111110        Peter   1
+0111110        turbulent       1
+0111110        German  1
+0111110        informatics,    1
+0111110        phone   1
+0111110        October 1
+0111110        directly        1
+0111110        His     2
+0111110        My      4
+0111110        his     5
+0111110        our     5
+01111110       Oh      1
+01111110       mortal  1
+01111110       Natalie 1
+01111110       83454   1
+01111110       programming     1
+01111110       she's   2
+01111110       Hi      2
+01111110       that    9
+01111111       attention.      1
+01111111       central 1
+01111111       town.   1
+01111111       town    1
+01111111       Spanish 1
+01111111       lodge   1
+01111111       right   1
+01111111       married 2
+01111111       later,  2
+01111111       from    9
+01111111       local   2
+1000   information.    1
+1000   capital.        1
+1000   officer.        1
+1000   retired 1
+1000   most.   1
+1000   reception       1
+1000   wounds  1
+1000   12      1
+1000   personal        1
+1000   colour. 1
+1000   shoes   1
+1000   030/827234.     1
+1000   inquiries?      1
+1000   Brandenburger   1
+1000   computer...     1
+1000   underground     1
+1000   smalltown       1
+1000   city    2
+1000   only    2
+1000   first   4
+1000   home    3
+1000   woman   3
+1000   famous  4
+1001   multiple        1
+1001   France  1
+1001   care    1
+1001   burnt   1
+1001   birthday        1
+1001   there   2
+1001   they    3
+1001   it      8
+1001   He      4
+1001   which   4
+1010   Now     1
+1010   off     1
+1010   yes,    1
+1010   too.    1
+1010   and     30
+1010   56,     1
+10110  Euro,   1
+10110  Heidelberg.     1
+10110  countries,      1
+10110  injured.        1
+10110  widow.  1
+10110  danger. 1
+10110  fact    1
+10110  magazine.       1
+10110  12.     1
+10110  anniversary.    1
+10110  traditional     1
+10110  up,     1
+10110  that?   1
+10110  Fritsch.        1
+10110  amazing,        1
+10110  "Twentytwo".    1
+10110  am      1
+10110  Ottobrunn.      1
+10110  years.  1
+10110  her.    1
+10110  whom    2
+10110  Hamburg.        4
+10110  .       4
+10110  So      6
+10111  photo   1
+10111  place.  1
+10111  p.m..   1
+10111  Heidelberg's    1
+10111  September,      1
+10111  21,     1
+10111  jacket, 1
+10111  anyway, 1
+10111  Therefore,      1
+10111  couple, 1
+10111  so      2
+10111  When    2
+10111  year,   3
+10111  husband 2
+1100   place,  1
+1100   Convulsed       1
+1100   Driving 1
+1100   notable 1
+1100   album   1
+1100   meal.   1
+1100   I've    2
+1100   Hi,     2
+1100   We      2
+1100   I       37
+110100 takes   1
+110100 reported        1
+110100 is      15
+110100 wasn't  3
+110101 Bye!    1
+110101 He's    1
+110101 bike    1
+110101 can     1
+110101 agency  1
+110101 Highfly-Hotel   1
+110101 shop    1
+110101 "Daily's"       1
+110101 was     15
+110101 depended        1
+110110 Afterwards,     1
+110110 maps.   1
+110110 Lenbachhaus.    1
+110110 flair   1
+110110 immediately     1
+110110 weren't 1
+110110 addresses       1
+110110 desk    1
+110110 station 1
+110110 I'll    1
+110110 Tor     1
+110110 hospital        1
+110110 because 2
+110110 own     2
+110110 into    6
+110110 as      4
+1101110        frequented      1
+1101110        yet     1
+1101110        Since   1
+1101110        made    1
+1101110        what    1
+1101110        he      9
+1101110        information     2
+1101111        Italian.        1
+1101111        entertainer     1
+1101111        foreign 1
+1101111        delighted.      1
+1101111        George  3
+1101111        we      7
+111000 wrote   1
+111000 hadnt't 1
+111000 looking 1
+111000 just    1
+111000 realized        1
+111000 their   1
+111000 never   1
+111000 love    1
+111000 brought 2
+111000 really  2
+111000 heard   2
+111000 Although        2
+111000 like    7
+1110010        live    1
+1110010        don't   1
+1110010        injured 1
+1110010        first,  1
+1110010        hope    1
+1110010        want    1
+1110010        didn`t  1
+1110010        knows   1
+1110010        merely  1
+1110010        two     1
+1110010        worked  2
+1110010        tried   2
+1110010        no      2
+1110010        moved   4
+1110010        best    2
+1110011        need    1
+1110011        always  1
+1110011        alone   1
+1110011        liked   1
+1110011        forward 1
+1110011        proposed        1
+1110011        came    1
+1110011        talking 1
+1110011        pick    1
+1110011        told    2
+1110011        went    2
+1110011        decided 3
+1110011        wanted  3
+1110011        how     3
+1110011        have    4
+1110100        gave    1
+1110100        downs   1
+1110100        appartment      1
+1110100        hospital.       1
+1110100        last-minute.    1
+1110100        languages,      1
+1110100        sights, 1
+1110100        enjoyed 1
+1110100        I'm     6
+1110100        I'd     4
+1110101        felt    1
+1110101        flames  1
+1110101        enjoy   1
+1110101        deem    1
+1110101        called? 1
+1110101        hardly  1
+1110101        spent   1
+1110101        asked   2
+1110101        had     7
+1110101        found   3
+1110110        Munich, 1
+1110110        Scotland,       1
+1110110        day,    1
+1110110        study   1
+1110110        friend. 1
+1110110        after   1
+1110110        apartments      1
+1110110        show    1
+1110110        there.  1
+1110110        read    2
+1110110        get     3
+1110110        know    6
+1110111        right?  1
+1110111        soon    1
+1110111        uni.    1
+1110111        ambulance.      1
+1110111        Sunday  1
+1110111        before. 1
+1110111        possible.       1
+1110111        my      9
+1110111        he'd    2
+111100 you'll  1
+111100 ?       1
+111100 not     2
+111100 to      42
+111101 it.     1
+111101 call    1
+111101 One     1
+111101 Bruno   1
+111101 once    1
+111101 around  1
+111101 for     7
+111101 at      13
+1111100        Hauptbahnhof?   1
+1111100        hesitant        1
+1111100        visit   1
+1111100        completely      1
+1111100        start   1
+1111100        managed 1
+1111100        money   1
+1111100        go      1
+1111100        offered 1
+1111100        possible        1
+1111100        afford  1
+1111100        driver  2
+1111100        write   3
+1111100        easy    2
+1111101        relaxed 1
+1111101        simply  1
+1111101        sure.   1
+1111101        starts  1
+1111101        friendly        1
+1111101        give    1
+1111101        sitting 1
+1111101        going   1
+1111101        urgent  1
+1111101        please  2
+1111101        next    3
+1111101        very    6
+1111110        who's   1
+1111110        much,   1
+1111110        friday? 1
+1111110        explained       1
+1111110        met     1
+1111110        Where   1
+1111110        How     2
+1111110        much    2
+1111110        are     2
+1111110        could   2
+1111110        me      6
+1111110        enough  3
+1111111        seen    1
+1111111        papers  1
+1111111        "Mondnacht"     1
+1111111        both.   1
+1111111        crashed 1
+1111111        studies 1
+1111111        bring   1
+1111111        pull    1
+1111111        teacher 1
+1111111        boy     1
+1111111        far     1
+1111111        move    1
+1111111        travelling      1
+1111111        Yeah    2
+1111111        ring    2
+1111111        meet    2
+1111111        find    5
+1111111        be      3
\ No newline at end of file

Reply via email to