Repository: opennlp Updated Branches: refs/heads/master 406021733 -> 60595251e
OPENNLP-1035:Add unit tests and javadocs for BrownBigramFeatureGenerator, closes apache/opennlp#174 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/60595251 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/60595251 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/60595251 Branch: refs/heads/master Commit: 60595251eec5979e14540c6d00043e24905a7404 Parents: 4060217 Author: jzonthemtn <[email protected]> Authored: Tue Apr 25 08:05:49 2017 -0400 Committer: smarthi <[email protected]> Committed: Tue Apr 25 08:05:49 2017 -0400 ---------------------------------------------------------------------- .../featuregen/BrownBigramFeatureGenerator.java | 20 +- .../BrownBigramFeatureGeneratorTest.java | 87 +++ .../opennlp/tools/formats/brown-cluster.txt | 665 +++++++++++++++++++ 3 files changed, 764 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java index 4f0a24a..f16ba97 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java @@ -24,25 +24,30 @@ import java.util.List; */ public class BrownBigramFeatureGenerator implements AdaptiveFeatureGenerator { - private BrownCluster brownLexicon; - - public BrownBigramFeatureGenerator(BrownCluster dict) { - this.brownLexicon = dict; + private BrownCluster brownCluster; + + /** + * Creates a new Brown Cluster bigram feature generator. + * @param brownCluster A {@link BrownCluster}. + */ + public BrownBigramFeatureGenerator(BrownCluster brownCluster) { + this.brownCluster = brownCluster; } + @Override public void createFeatures(List<String> features, String[] tokens, int index, String[] previousOutcomes) { - List<String> wordClasses = BrownTokenClasses.getWordClasses(tokens[index], brownLexicon); + List<String> wordClasses = BrownTokenClasses.getWordClasses(tokens[index], brownCluster); if (index > 0) { - List<String> prevWordClasses = BrownTokenClasses.getWordClasses(tokens[index - 1], brownLexicon); + List<String> prevWordClasses = BrownTokenClasses.getWordClasses(tokens[index - 1], brownCluster); for (int i = 0; i < wordClasses.size() && i < prevWordClasses.size(); i++) features.add("p" + "browncluster" + "," + "browncluster" + "=" + prevWordClasses.get(i) + "," + wordClasses.get(i)); } if (index + 1 < tokens.length) { - List<String> nextWordClasses = BrownTokenClasses.getWordClasses(tokens[index + 1], brownLexicon); + List<String> nextWordClasses = BrownTokenClasses.getWordClasses(tokens[index + 1], brownCluster); for (int i = 0; i < wordClasses.size() && i < nextWordClasses.size(); i++) { features.add("browncluster" + "," + "n" + "browncluster" + "=" + wordClasses.get(i) + "," + nextWordClasses.get(i)); @@ -51,4 +56,3 @@ public class BrownBigramFeatureGenerator implements AdaptiveFeatureGenerator { } } - http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java new file mode 100644 index 0000000..03810e8 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.featuregen; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import opennlp.tools.formats.ResourceAsStreamFactory; + +public class BrownBigramFeatureGeneratorTest { + + private AdaptiveFeatureGenerator generator; + + @Before + public void setup() throws IOException { + + ResourceAsStreamFactory stream = new ResourceAsStreamFactory( + getClass(), "/opennlp/tools/formats/brown-cluster.txt"); + + BrownCluster brownCluster = new BrownCluster(stream.createInputStream()); + + generator = new BrownBigramFeatureGenerator(brownCluster); + + } + + @Test + public void createFeaturesTest() throws IOException { + + String[] tokens = new String[] {"he", "went", "with", "you"}; + + List<String> features = new ArrayList<>(); + generator.createFeatures(features, tokens, 3, null); + + Assert.assertEquals(2, features.size()); + Assert.assertTrue(features.contains("pbrowncluster,browncluster=0101,0010")); + Assert.assertTrue(features.contains("pbrowncluster,browncluster=01010,00101")); + + } + + @Test + public void createFeaturesSuccessiveTokensTest() throws IOException { + + final String[] testSentence = new String[] {"he", "went", "with", "you", "in", "town"}; + + List<String> features = new ArrayList<>(); + generator.createFeatures(features, testSentence, 3, null); + + Assert.assertEquals(3, features.size()); + Assert.assertTrue(features.contains("pbrowncluster,browncluster=0101,0010")); + Assert.assertTrue(features.contains("pbrowncluster,browncluster=01010,00101")); + Assert.assertTrue(features.contains("browncluster,nbrowncluster=0010,0000")); + + } + + @Test + public void noFeaturesTest() throws IOException { + + final String[] testSentence = new String[] {"he", "went", "with", "you"}; + + List<String> features = new ArrayList<>(); + generator.createFeatures(features, testSentence, 0, null); + + Assert.assertEquals(0, features.size()); + + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt new file mode 100644 index 0000000..df31bc7 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt @@ -0,0 +1,665 @@ +0000 18, 1 +0000 wedding 1 +0000 A 1 +0000 No, 1 +0000 prefered 1 +0000 hurry 1 +0000 address? 1 +0000 sounds 1 +0000 any 1 +0000 soon, 1 +0000 in 56 +0000 Worcesterstreet 1 +00010 summer. 1 +00010 56473 1 +00010 different 1 +00010 20193 1 +00010 Ulm 1 +00010 17818 1 +00010 beautiful 1 +00010 23213 1 +00010 12424 1 +00010 Rue-de-Grandes-Illusions 1 +00010 good. 1 +00010 Barmerstr. 1 +00010 81737 1 +00010 order 1 +00010 1912 1 +00010 63737 1 +00010 Chesterstr. 1 +00010 80333 1 +00010 81234 1 +00010 that's 1 +00010 78181 1 +00010 30291 1 +00010 84630 1 +00010 25334 1 +00010 30303 2 +00010 Leipzig. 2 +00010 your 3 +00010 her 10 +000110 5. 1 +000110 Hamburg, 1 +000110 contact 1 +000110 faked. 1 +000110 streetname 1 +000110 34. 1 +000110 83939 1 +000110 25. 1 +000110 2. 1 +000110 part-time 1 +000110 help-wanted 1 +000110 11 1 +000110 some 1 +000110 Gauting. 1 +000110 address. 1 +000110 parent's 1 +000110 reply. 1 +000110 touch 1 +000110 Berlin. 5 +000110 Munich. 5 +000111 there, 1 +000111 Schulz 1 +000111 Paris 1 +000111 Edinburgh, 1 +000111 day 1 +000111 1 1 +000111 you? 1 +000111 saw 1 +000111 see 1 +000111 house 1 +000111 recently 1 +000111 Don't 1 +000111 back 1 +000111 apartment 1 +000111 12, 1 +000111 Are 2 +000111 Could 2 +000111 did 2 +000111 job 2 +000111 still 3 +000111 Thank 3 +000111 up 3 +00100 30202. 1 +00100 Yesterday, 1 +00100 ad 1 +00100 homesick, 1 +00100 Now, 1 +00100 man 1 +00100 help. 1 +00100 area. 1 +00100 "Westbad". 1 +00100 or 2 +00100 It's 2 +00100 It 2 +00100 The 7 +00100 As 3 +00101 Arent't 1 +00101 offer. 1 +00101 celebrated 1 +00101 available. 1 +00101 spontaneously. 1 +00101 sounding 1 +00101 party 2 +00101 you 12 +001100 last 1 +001100 called, 1 +001100 That 1 +001100 life 1 +001100 pointed 1 +001100 building 1 +001100 restaurant 1 +001100 5, 1 +001100 one 1 +001100 interested 1 +001100 located 1 +001100 Please 1 +001100 answered 1 +001100 Hospital 1 +001100 112, 2 +001100 arrived 3 +001100 lived 4 +001100 lives 4 +001101 Unter-den-Linden 1 +001101 this 1 +001101 moment. 1 +001101 tip 1 +001101 10th 1 +001101 reckon. 1 +001101 factory 1 +001101 line 1 +001101 Paracelsus 1 +001101 Alan 1 +001101 it's 2 +001101 company 2 +001101 who 4 +001110 didn't 1 +001110 postcode 1 +001110 police 1 +001110 building. 1 +001110 concierge 1 +001110 flaring 1 +001110 finally 3 +001110 she 7 +001110 Last 4 +001110 She 5 +0011110 Erding, 1 +0011110 Spain, 1 +0011110 resident, 1 +0011110 lady, 1 +0011110 later 1 +0011110 business 1 +0011110 idea 1 +0011110 Berlin 1 +0011110 England, 1 +0011110 Sure, 1 +0011110 , 10 +0011110 longer 1 +0011111 is. 1 +0011111 15 1 +0011111 Schneider 1 +0011111 Hinterhofer 1 +0011111 me. 1 +0011111 Our 1 +0011111 Seile 1 +0011111 Meier 1 +0011111 Bauer 1 +0011111 Sander 1 +0011111 Clara 1 +0011111 Schmidt 2 +0011111 minutes 2 +0011111 Miller 5 +0100 school 1 +0100 They 1 +0100 8 1 +0100 9 1 +0100 Europe. 1 +0100 those 1 +0100 Baumann, 1 +0100 a 38 +0100 high 1 +01010 About 1 +01010 has 1 +01010 us, 1 +01010 13, 1 +01010 university. 1 +01010 tell 1 +01010 On 2 +01010 than 2 +01010 An 2 +01010 Alisa 2 +01010 on 3 +01010 with 7 +01010 called 5 +01010 got 5 +01011 through 1 +01011 shoes? 1 +01011 city. 1 +01011 quickly 1 +01011 trauma, 1 +01011 situate 1 +01011 much! 1 +01011 then, 1 +01011 friday! 1 +01011 about 1 +01011 knew 2 +01011 of 17 +01011 him 3 +011000 drove 1 +011000 Yes, 1 +011000 away. 1 +011000 parents' 1 +011000 life-threatening, 1 +011000 Weilheim, 1 +011000 15. 1 +011000 33, 1 +011000 86th 1 +011000 1995. 1 +011000 apartment, 1 +011000 took 2 +011000 where 3 +011000 if 5 +011000 But 7 +011001 the 54 +011001 Blumenweg 1 +011010 problem 1 +011010 country 1 +011010 Her 1 +011010 rumour 1 +011010 middle-aged 1 +011010 police. 1 +011010 exhibition. 1 +011010 empty 1 +011010 hours 1 +011010 father 1 +011010 area 1 +011010 staff 1 +011010 Reichstag. 1 +011010 "Tapasbar" 1 +011010 to. 1 +011010 Lenbachhaus 1 +011010 complete 1 +011010 owner 1 +011010 1. 1 +011010 11, 1 +011010 15, 2 +011010 street 2 +011010 accident 2 +011010 Ostbahnhof 2 +011010 address 3 +0110110 help 1 +0110110 grateful 1 +0110110 singer 1 +0110110 new 1 +0110110 moment 1 +0110110 costumers 1 +0110110 ancestors. 1 +0110110 Schubert 1 +0110110 ups 1 +0110110 pedestrians. 1 +0110110 hint 1 +0110110 semester, 1 +0110110 aunt 1 +0110110 face-to-face, 1 +0110110 guests 1 +0110110 happy 1 +0110110 number 2 +0110110 6, 2 +0110110 name 8 +01101110 French 1 +01101110 Luise 1 +01101110 knowledge 1 +01101110 pictures 1 +01101110 them 2 +01101110 away 2 +01101110 out 4 +01101110 years 2 +01101111 pain, 1 +01101111 Is 1 +01101111 sign 1 +01101111 home, 1 +01101111 14, 1 +01101111 appreciated 1 +01101111 happened 1 +01101111 by 1 +01101111 point: 1 +01101111 opened 2 +01101111 near 4 +01101111 instantly 3 +01110 taxi 1 +01110 p.m.! 1 +01110 13 1 +01110 barbecue. 1 +01110 speed 1 +01110 tree. 1 +01110 tenant 1 +01110 metropolis 1 +01110 delivery 1 +01110 family 1 +01110 list 1 +01110 week. 1 +01110 student, 1 +01110 delicious 1 +01110 good 1 +01110 well-payed 1 +01110 student 1 +01110 person! 1 +01110 smaller 1 +01110 small 2 +01110 more 2 +01110 look 2 +01110 quite 2 +01110 bigger 2 +01110 young 2 +01110 tourist 2 +01110 great 3 +01110 letter 3 +01110 friend 4 +0111100 Elenor 1 +0111100 definitely 1 +0111100 Gina 1 +0111100 currently 1 +0111100 Marie 1 +0111100 McKennedy 1 +0111100 ten 1 +0111100 sometimes. 1 +0111100 Michael 1 +0111100 Michel 1 +0111100 competent 1 +0111100 Gerhard 1 +0111100 Stefanie 2 +0111100 five 2 +0111100 Mike 2 +0111100 Stefan 3 +0111101 particulary 1 +0111101 broken. 1 +0111101 10 1 +0111101 leather? 1 +0111101 grandaunt. 1 +0111101 90 1 +0111101 Julie 1 +0111101 badly 1 +0111101 you: 1 +0111101 July 1 +0111101 painfully 1 +0111101 founded 1 +0111101 Fernandes 1 +0111101 old 2 +0111101 elderly 2 +0111101 March 2 +0111101 him. 2 +0111101 2 2 +0111101 an 5 +0111110 6th 1 +0111110 Peter 1 +0111110 turbulent 1 +0111110 German 1 +0111110 informatics, 1 +0111110 phone 1 +0111110 October 1 +0111110 directly 1 +0111110 His 2 +0111110 My 4 +0111110 his 5 +0111110 our 5 +01111110 Oh 1 +01111110 mortal 1 +01111110 Natalie 1 +01111110 83454 1 +01111110 programming 1 +01111110 she's 2 +01111110 Hi 2 +01111110 that 9 +01111111 attention. 1 +01111111 central 1 +01111111 town. 1 +01111111 town 1 +01111111 Spanish 1 +01111111 lodge 1 +01111111 right 1 +01111111 married 2 +01111111 later, 2 +01111111 from 9 +01111111 local 2 +1000 information. 1 +1000 capital. 1 +1000 officer. 1 +1000 retired 1 +1000 most. 1 +1000 reception 1 +1000 wounds 1 +1000 12 1 +1000 personal 1 +1000 colour. 1 +1000 shoes 1 +1000 030/827234. 1 +1000 inquiries? 1 +1000 Brandenburger 1 +1000 computer... 1 +1000 underground 1 +1000 smalltown 1 +1000 city 2 +1000 only 2 +1000 first 4 +1000 home 3 +1000 woman 3 +1000 famous 4 +1001 multiple 1 +1001 France 1 +1001 care 1 +1001 burnt 1 +1001 birthday 1 +1001 there 2 +1001 they 3 +1001 it 8 +1001 He 4 +1001 which 4 +1010 Now 1 +1010 off 1 +1010 yes, 1 +1010 too. 1 +1010 and 30 +1010 56, 1 +10110 Euro, 1 +10110 Heidelberg. 1 +10110 countries, 1 +10110 injured. 1 +10110 widow. 1 +10110 danger. 1 +10110 fact 1 +10110 magazine. 1 +10110 12. 1 +10110 anniversary. 1 +10110 traditional 1 +10110 up, 1 +10110 that? 1 +10110 Fritsch. 1 +10110 amazing, 1 +10110 "Twentytwo". 1 +10110 am 1 +10110 Ottobrunn. 1 +10110 years. 1 +10110 her. 1 +10110 whom 2 +10110 Hamburg. 4 +10110 . 4 +10110 So 6 +10111 photo 1 +10111 place. 1 +10111 p.m.. 1 +10111 Heidelberg's 1 +10111 September, 1 +10111 21, 1 +10111 jacket, 1 +10111 anyway, 1 +10111 Therefore, 1 +10111 couple, 1 +10111 so 2 +10111 When 2 +10111 year, 3 +10111 husband 2 +1100 place, 1 +1100 Convulsed 1 +1100 Driving 1 +1100 notable 1 +1100 album 1 +1100 meal. 1 +1100 I've 2 +1100 Hi, 2 +1100 We 2 +1100 I 37 +110100 takes 1 +110100 reported 1 +110100 is 15 +110100 wasn't 3 +110101 Bye! 1 +110101 He's 1 +110101 bike 1 +110101 can 1 +110101 agency 1 +110101 Highfly-Hotel 1 +110101 shop 1 +110101 "Daily's" 1 +110101 was 15 +110101 depended 1 +110110 Afterwards, 1 +110110 maps. 1 +110110 Lenbachhaus. 1 +110110 flair 1 +110110 immediately 1 +110110 weren't 1 +110110 addresses 1 +110110 desk 1 +110110 station 1 +110110 I'll 1 +110110 Tor 1 +110110 hospital 1 +110110 because 2 +110110 own 2 +110110 into 6 +110110 as 4 +1101110 frequented 1 +1101110 yet 1 +1101110 Since 1 +1101110 made 1 +1101110 what 1 +1101110 he 9 +1101110 information 2 +1101111 Italian. 1 +1101111 entertainer 1 +1101111 foreign 1 +1101111 delighted. 1 +1101111 George 3 +1101111 we 7 +111000 wrote 1 +111000 hadnt't 1 +111000 looking 1 +111000 just 1 +111000 realized 1 +111000 their 1 +111000 never 1 +111000 love 1 +111000 brought 2 +111000 really 2 +111000 heard 2 +111000 Although 2 +111000 like 7 +1110010 live 1 +1110010 don't 1 +1110010 injured 1 +1110010 first, 1 +1110010 hope 1 +1110010 want 1 +1110010 didn`t 1 +1110010 knows 1 +1110010 merely 1 +1110010 two 1 +1110010 worked 2 +1110010 tried 2 +1110010 no 2 +1110010 moved 4 +1110010 best 2 +1110011 need 1 +1110011 always 1 +1110011 alone 1 +1110011 liked 1 +1110011 forward 1 +1110011 proposed 1 +1110011 came 1 +1110011 talking 1 +1110011 pick 1 +1110011 told 2 +1110011 went 2 +1110011 decided 3 +1110011 wanted 3 +1110011 how 3 +1110011 have 4 +1110100 gave 1 +1110100 downs 1 +1110100 appartment 1 +1110100 hospital. 1 +1110100 last-minute. 1 +1110100 languages, 1 +1110100 sights, 1 +1110100 enjoyed 1 +1110100 I'm 6 +1110100 I'd 4 +1110101 felt 1 +1110101 flames 1 +1110101 enjoy 1 +1110101 deem 1 +1110101 called? 1 +1110101 hardly 1 +1110101 spent 1 +1110101 asked 2 +1110101 had 7 +1110101 found 3 +1110110 Munich, 1 +1110110 Scotland, 1 +1110110 day, 1 +1110110 study 1 +1110110 friend. 1 +1110110 after 1 +1110110 apartments 1 +1110110 show 1 +1110110 there. 1 +1110110 read 2 +1110110 get 3 +1110110 know 6 +1110111 right? 1 +1110111 soon 1 +1110111 uni. 1 +1110111 ambulance. 1 +1110111 Sunday 1 +1110111 before. 1 +1110111 possible. 1 +1110111 my 9 +1110111 he'd 2 +111100 you'll 1 +111100 ? 1 +111100 not 2 +111100 to 42 +111101 it. 1 +111101 call 1 +111101 One 1 +111101 Bruno 1 +111101 once 1 +111101 around 1 +111101 for 7 +111101 at 13 +1111100 Hauptbahnhof? 1 +1111100 hesitant 1 +1111100 visit 1 +1111100 completely 1 +1111100 start 1 +1111100 managed 1 +1111100 money 1 +1111100 go 1 +1111100 offered 1 +1111100 possible 1 +1111100 afford 1 +1111100 driver 2 +1111100 write 3 +1111100 easy 2 +1111101 relaxed 1 +1111101 simply 1 +1111101 sure. 1 +1111101 starts 1 +1111101 friendly 1 +1111101 give 1 +1111101 sitting 1 +1111101 going 1 +1111101 urgent 1 +1111101 please 2 +1111101 next 3 +1111101 very 6 +1111110 who's 1 +1111110 much, 1 +1111110 friday? 1 +1111110 explained 1 +1111110 met 1 +1111110 Where 1 +1111110 How 2 +1111110 much 2 +1111110 are 2 +1111110 could 2 +1111110 me 6 +1111110 enough 3 +1111111 seen 1 +1111111 papers 1 +1111111 "Mondnacht" 1 +1111111 both. 1 +1111111 crashed 1 +1111111 studies 1 +1111111 bring 1 +1111111 pull 1 +1111111 teacher 1 +1111111 boy 1 +1111111 far 1 +1111111 move 1 +1111111 travelling 1 +1111111 Yeah 2 +1111111 ring 2 +1111111 meet 2 +1111111 find 5 +1111111 be 3 \ No newline at end of file
