I am trying to train an en-ner-location.bin model file using OpenNLP in Java.
I have the training text in the following format, one sample per line:

<START:location> Fontana <END> .
<START:location> Palo Verde <END> .
<START:location> Picacho <END> .

I stored it in a text file called citytrain.txt; the file contains 120770
lines of city names. Then I used the following code to train the model:

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collections;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;

public class TrainNames {        
                @SuppressWarnings("deprecation")
                public void TrainNames() throws IOException{
                        //File baseDir = new File("src/test/resources");
                    //File destDir = new File("target");
                    //<start id="ne-train"/>
                    File inFile = new File("citytrain.txt");
                    NameSampleDataStream nss = new NameSampleDataStream( //<co 
id="co.opennlp.name.initnamestream"/>
                      new PlainTextByLineStream(
                        new java.io.FileReader(inFile)));

                    int iterations = 100;
                    int cutoff = 5;
                    TokenNameFinderModel model = NameFinderME.train( //<co 
id="co.opennlp.name.train"/>
                        "en", // language
                        "location", // type
                        nss, 
                        (AdaptiveFeatureGenerator) null,
                        Collections.<String,Object>emptyMap(),
                        iterations,
                        cutoff);
                    
                    File outFile = new File("en-ner-locationNews.bin");
                    FileOutputStream outFileStream = new 
FileOutputStream(outFile);
                    model.serialize(outFileStream);
                }
}

When I used the output .bin file to test it on a string, to pull "Fontana" out
of the string, the result returned was the whole string. I don't know why, or
what I am doing wrong. The following is the code I used to extract Fontana
from a string:

import java.io.FileInputStream;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;

import org.xml.sax.SAXException;


public class CityFinder {

    public String Tokens[];

    public static void main(String[] args) throws IOException, SAXException {

        CityFinder toi = new CityFinder();
        String cnt;
        cnt="John is planning to specialize in Electrical Engineering in UC 
Fontana and pursue a career with IBM.";
        toi.tokenization(cnt);
        String cities = toi.namefind(toi.Tokens);
        String org = toi.orgfind(toi.Tokens);

        System.out.println("City name is : "+cities);
        System.out.println("organization name is: "+org);

    }
        public String namefind(String cnt[]) {
        InputStream is;
        TokenNameFinderModel tnf;
        NameFinderME nf;
        String sd = "";
        try {
            is = new FileInputStream("en-ner-locationNew.bin");
            tnf = new TokenNameFinderModel(is);
            nf = new NameFinderME(tnf);
            Span sp[] = nf.find(cnt);
            String a[] = Span.spansToStrings(sp, cnt);
            StringBuilder fd = new StringBuilder();
            int l = a.length;

            for (int j = 0; j < l; j++) {
                fd = fd.append(a[j] + "\n");

            }
            sd = fd.toString();

        } catch (FileNotFoundException e) {

            e.printStackTrace();
        } catch (InvalidFormatException e) {

            e.printStackTrace();
        } catch (IOException e) {

            e.printStackTrace();
        }
        return sd;
    }

    public String orgfind(String cnt[]) {
        InputStream is;
        TokenNameFinderModel tnf;
        NameFinderME nf;
        String sd = "";
        try {
            is = new FileInputStream("en-ner-organization.bin");
            tnf = new TokenNameFinderModel(is);
            nf = new NameFinderME(tnf);
            Span sp[] = nf.find(cnt);
            String a[] = Span.spansToStrings(sp, cnt);
            StringBuilder fd = new StringBuilder();
            int l = a.length;
            for (int j = 0; j < l; j++) {
                fd = fd.append(a[j] + "\n");

            }

            sd = fd.toString();

        } catch (FileNotFoundException e) {

            e.printStackTrace();
        } catch (InvalidFormatException e) {

            e.printStackTrace();
        } catch (IOException e) {

            e.printStackTrace();
        }
        return sd;

    }
    public void tokenization(String tokens) {

        InputStream is;
        TokenizerModel tm;
        try {
            is = new FileInputStream("en-token.bin");
            tm = new TokenizerModel(is);
            Tokenizer tz = new TokenizerME(tm);
            Tokens = tz.tokenize(tokens);
            // System.out.println(Tokens[1]);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}
Can you please let me know what I did wrong?
          

Reply via email to