Bayesian classifiers using Lucene as data store
-----------------------------------------------
Key: LUCENE-1039
URL: https://issues.apache.org/jira/browse/LUCENE-1039
Project: Lucene - Java
Issue Type: New Feature
Reporter: Karl Wettin
Priority: Minor
Bayesian classifiers that use a Lucene index as their data store. They are based on
the Naive Bayes and Fisher's method algorithms as described by Toby Segaran in
"Programming Collective Intelligence", ISBN 978-0-596-52932-1.
Have fun.
The Javadocs are sparse, but the following TestCase shows how to use it:
{code:java}
public class TestClassifier extends TestCase {
public void test() throws Exception {
InstanceFactory instanceFactory = new InstanceFactory() {
public Document factory(String text, String _class) {
Document doc = new Document();
doc.add(new Field("class", _class, Field.Store.YES,
Field.Index.NO_NORMS));
doc.add(new Field("text", text, Field.Store.YES, Field.Index.NO,
Field.TermVector.NO));
doc.add(new Field("text/ngrams/start", text, Field.Store.NO,
Field.Index.TOKENIZED, Field.TermVector.YES));
doc.add(new Field("text/ngrams/inner", text, Field.Store.NO,
Field.Index.TOKENIZED, Field.TermVector.YES));
doc.add(new Field("text/ngrams/end", text, Field.Store.NO,
Field.Index.TOKENIZED, Field.TermVector.YES));
return doc;
}
Analyzer analyzer = new Analyzer() {
private int minGram = 2;
private int maxGram = 3;
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream ts = new StandardTokenizer(reader);
ts = new LowerCaseFilter(ts);
if (fieldName.endsWith("/ngrams/start")) {
ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT,
minGram, maxGram);
} else if (fieldName.endsWith("/ngrams/inner")) {
ts = new NGramTokenFilter(ts, minGram, maxGram);
} else if (fieldName.endsWith("/ngrams/end")) {
ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.BACK,
minGram, maxGram);
}
return ts;
}
};
public Analyzer getAnalyzer() {
return analyzer;
}
};
Directory dir = new RAMDirectory();
new IndexWriter(dir, null, true).close();
Instances instances = new Instances(dir, instanceFactory, "class");
instances.addInstance("hello world", "en");
instances.addInstance("hallå världen", "sv");
instances.addInstance("this is london calling", "en");
instances.addInstance("detta är london som ringer", "sv");
instances.addInstance("john has a long mustache", "en");
instances.addInstance("john har en lång mustache", "sv");
instances.addInstance("all work and no play makes jack a dull boy", "en");
instances.addInstance("att bara arbeta och aldrig leka gör jack en trist
gosse", "sv");
instances.addInstance("shrimp sandwich", "en");
instances.addInstance("räksmörgås", "sv");
instances.addInstance("it's now or never", "en");
instances.addInstance("det är nu eller aldrig", "sv");
instances.addInstance("to tie up at a landing-stage", "en");
instances.addInstance("att angöra en brygga", "sv");
instances.addInstance("it's now time for the children's television shows",
"en");
instances.addInstance("nu är det dags för barnprogram", "sv");
instances.flush();
testClassifier(instances, new NaiveBayesClassifier());
testClassifier(instances, new FishersMethodClassifier());
instances.close();
}
private void testClassifier(Instances instances, BayesianClassifier
classifier) throws IOException {
assertEquals("sv", classifier.classify(instances, "detta blir ett
test")[0].getClassification());
assertEquals("en", classifier.classify(instances, "this will be a
test")[0].getClassification());
// test training data instances. all ought to match!
for (int documentNumber = 0; documentNumber <
instances.getIndexReader().maxDoc(); documentNumber++) {
if (!instances.getIndexReader().isDeleted(documentNumber)) {
Map<Term, Double> features =
instances.extractFeatures(instances.getIndexReader(), documentNumber,
classifier.isNormalized());
Document document = instances.getIndexReader().document(documentNumber);
assertEquals(document.get("class"), classifier.classify(instances,
features)[0].getClassification());
}
}
}
{code}
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]