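Do you have any context around the locations in the training file? If not, you might be getting the entire sentence back because you are basically training the model to tag any word as a location. The name finder learns from the words surrounding each name, so the training file should contain full sentences with the locations marked in place rather than a bare list of city names.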
Mg > On May 18, 2014, at 3:30 PM, lina abu jaradeh <[email protected]> wrote: > > I am trying to train en-ner-location.bin file using opennlp in java The thing > is i got the training text file in the following format <START:location> > Fontana <END> .<START:location> Palo Verde <END> . > <START:location> Picacho <END> . > I stored it in a text file called citytrain.txt the file contains 120770 line > of city namesthen i used the following code to train the fileimport > java.io.BufferedOutputStream; > import java.io.BufferedReader; > import java.io.File; > import java.io.FileInputStream; > import java.io.FileOutputStream; > import java.io.FileReader; > import java.io.IOException; > import java.io.InputStream; > import java.nio.charset.Charset; > import java.util.Collections; > > import opennlp.tools.namefind.NameFinderME; > import opennlp.tools.namefind.NameSample; > import opennlp.tools.namefind.NameSampleDataStream; > import opennlp.tools.namefind.TokenNameFinderModel; > import opennlp.tools.tokenize.Tokenizer; > import opennlp.tools.tokenize.TokenizerME; > import opennlp.tools.tokenize.TokenizerModel; > import opennlp.tools.util.ObjectStream; > import opennlp.tools.util.PlainTextByLineStream; > import opennlp.tools.util.Span; > import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator; > > public class TrainNames { > @SuppressWarnings("deprecation") > public void TrainNames() throws IOException{ > //File baseDir = new File("src/test/resources"); > //File destDir = new File("target"); > //<start id="ne-train"/> > File inFile = new File("citytrain.txt"); > NameSampleDataStream nss = new NameSampleDataStream( //<co > id="co.opennlp.name.initnamestream"/> > new PlainTextByLineStream( > new java.io.FileReader(inFile))); > > int iterations = 100; > int cutoff = 5; > TokenNameFinderModel model = NameFinderME.train( //<co > id="co.opennlp.name.train"/> > "en", // language > "location", // type > nss, > (AdaptiveFeatureGenerator) null, > 
Collections.<String,Object>emptyMap(), > iterations, > cutoff); > > File outFile = new File("en-ner-locationNews.bin"); > FileOutputStream outFileStream = new FileOutputStream(outFile); > model.serialize(outFileStream); > } > } > > when i used the output bi file to test it on a string to pull up Fontana from > a string the result returned the whole string....don't know why or what i am > doing wrongthe following code i used to extract Fontana from a stringimport > java.io.FileInputStream; > > import java.io.FileNotFoundException; > import java.io.IOException; > import java.io.InputStream; > import opennlp.tools.namefind.NameFinderME; > import opennlp.tools.namefind.TokenNameFinderModel; > import opennlp.tools.util.InvalidFormatException; > import opennlp.tools.util.Span; > import opennlp.tools.tokenize.Tokenizer; > import opennlp.tools.tokenize.TokenizerME; > import opennlp.tools.tokenize.TokenizerModel; > import opennlp.tools.tokenize.SimpleTokenizer; > import opennlp.tools.sentdetect.SentenceDetectorME; > import opennlp.tools.sentdetect.SentenceModel; > > import org.xml.sax.SAXException; > > > public class CityFinder { > > public String Tokens[]; > > public static void main(String[] args) throws IOException, SAXException { > > CityFinder toi = new CityFinder(); > String cnt; > cnt="John is planning to specialize in Electrical Engineering in UC > Fontana and pursue a career with IBM."; > toi.tokenization(cnt); > String cities = toi.namefind(toi.Tokens); > String org = toi.orgfind(toi.Tokens); > > System.out.println("City name is : "+cities); > System.out.println("organization name is: "+org); > > } > public String namefind(String cnt[]) { > InputStream is; > TokenNameFinderModel tnf; > NameFinderME nf; > String sd = ""; > try { > is = new FileInputStream("en-ner-locationNew.bin"); > tnf = new TokenNameFinderModel(is); > nf = new NameFinderME(tnf); > Span sp[] = nf.find(cnt); > String a[] = Span.spansToStrings(sp, cnt); > StringBuilder fd = new StringBuilder(); > 
int l = a.length; > > for (int j = 0; j < l; j++) { > fd = fd.append(a[j] + "\n"); > > } > sd = fd.toString(); > > } catch (FileNotFoundException e) { > > e.printStackTrace(); > } catch (InvalidFormatException e) { > > e.printStackTrace(); > } catch (IOException e) { > > e.printStackTrace(); > } > return sd; > } > > public String orgfind(String cnt[]) { > InputStream is; > TokenNameFinderModel tnf; > NameFinderME nf; > String sd = ""; > try { > is = new FileInputStream("en-ner-organization.bin"); > tnf = new TokenNameFinderModel(is); > nf = new NameFinderME(tnf); > Span sp[] = nf.find(cnt); > String a[] = Span.spansToStrings(sp, cnt); > StringBuilder fd = new StringBuilder(); > int l = a.length; > for (int j = 0; j < l; j++) { > fd = fd.append(a[j] + "\n"); > > } > > sd = fd.toString(); > > } catch (FileNotFoundException e) { > > e.printStackTrace(); > } catch (InvalidFormatException e) { > > e.printStackTrace(); > } catch (IOException e) { > > e.printStackTrace(); > } > return sd; > > } > public void tokenization(String tokens) { > > InputStream is; > TokenizerModel tm; > try { > is = new FileInputStream("en-token.bin"); > tm = new TokenizerModel(is); > Tokenizer tz = new TokenizerME(tm); > Tokens = tz.tokenize(tokens); > // System.out.println(Tokens[1]); > } catch (IOException e) { > e.printStackTrace(); > } > } > > } > can you please let me know where i did wrong...??
