I am trying to train an en-ner-location.bin model file using OpenNLP in Java.
I have the training text in the following format:

<START:location> Fontana <END> .
<START:location> Palo Verde <END> .
<START:location> Picacho <END> .

I stored it in a text file called citytrain.txt; the file contains 120770 lines
of city names. Then I used the following code to train the model:

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collections;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
public class TrainNames {
@SuppressWarnings("deprecation")
public void TrainNames() throws IOException{
//File baseDir = new File("src/test/resources");
//File destDir = new File("target");
//<start id="ne-train"/>
File inFile = new File("citytrain.txt");
NameSampleDataStream nss = new NameSampleDataStream( //<co
id="co.opennlp.name.initnamestream"/>
new PlainTextByLineStream(
new java.io.FileReader(inFile)));
int iterations = 100;
int cutoff = 5;
TokenNameFinderModel model = NameFinderME.train( //<co
id="co.opennlp.name.train"/>
"en", // language
"location", // type
nss,
(AdaptiveFeatureGenerator) null,
Collections.<String,Object>emptyMap(),
iterations,
cutoff);
File outFile = new File("en-ner-locationNews.bin");
FileOutputStream outFileStream = new
FileOutputStream(outFile);
model.serialize(outFileStream);
}
}
When I used the output .bin file to pull "Fontana" out of a string, the result
was the whole string. I don't know why, or what I am doing wrong. This is the
code I used to extract Fontana from a string:

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import org.xml.sax.SAXException;
public class CityFinder {
public String Tokens[];
public static void main(String[] args) throws IOException, SAXException {
CityFinder toi = new CityFinder();
String cnt;
cnt="John is planning to specialize in Electrical Engineering in UC
Fontana and pursue a career with IBM.";
toi.tokenization(cnt);
String cities = toi.namefind(toi.Tokens);
String org = toi.orgfind(toi.Tokens);
System.out.println("City name is : "+cities);
System.out.println("organization name is: "+org);
}
public String namefind(String cnt[]) {
InputStream is;
TokenNameFinderModel tnf;
NameFinderME nf;
String sd = "";
try {
is = new FileInputStream("en-ner-locationNew.bin");
tnf = new TokenNameFinderModel(is);
nf = new NameFinderME(tnf);
Span sp[] = nf.find(cnt);
String a[] = Span.spansToStrings(sp, cnt);
StringBuilder fd = new StringBuilder();
int l = a.length;
for (int j = 0; j < l; j++) {
fd = fd.append(a[j] + "\n");
}
sd = fd.toString();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (InvalidFormatException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return sd;
}
public String orgfind(String cnt[]) {
InputStream is;
TokenNameFinderModel tnf;
NameFinderME nf;
String sd = "";
try {
is = new FileInputStream("en-ner-organization.bin");
tnf = new TokenNameFinderModel(is);
nf = new NameFinderME(tnf);
Span sp[] = nf.find(cnt);
String a[] = Span.spansToStrings(sp, cnt);
StringBuilder fd = new StringBuilder();
int l = a.length;
for (int j = 0; j < l; j++) {
fd = fd.append(a[j] + "\n");
}
sd = fd.toString();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (InvalidFormatException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return sd;
}
public void tokenization(String tokens) {
InputStream is;
TokenizerModel tm;
try {
is = new FileInputStream("en-token.bin");
tm = new TokenizerModel(is);
Tokenizer tz = new TokenizerME(tm);
Tokens = tz.tokenize(tokens);
// System.out.println(Tokens[1]);
} catch (IOException e) {
e.printStackTrace();
}
}
}
Can you please let me know what I did wrong?