On Thu, Jan 17, 2013 at 9:08 AM, Robert Muir <rcm...@gmail.com> wrote: > Which statistics in particular (which methods)?
I'd like to know the frequency of each term in each document. Those term counts for the most frequent terms in the corpus will make it into the document vectors for clustering. Looking at Terms and TermsEnum, I'm actually somewhat baffled about how to do this. Iterating over the TermsEnums in a Terms retrieved by IndexReader.getTermVector() will tell me about the presence of a term within a document, but I don't see a simple "count" or "freq" method in TermsEnum--the methods there look like corpus statistics. Based on Ian's reply, I created the following one-file test program. The results I get are weird: I get a term vector back for the first document, but not for the second. Output: doc 0 had term 'baz' doc 0 had term 'foobar' doc 0 had term 'gibberish' doc 0 had 3 terms doc 1 had no term vector for body Thanks again for the responses and assistance. Jon import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.SlowCompositeReaderWrapper; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.FieldType; public class LuceneTest { static void createIndex(final String path) throws IOException, CorruptIndexException { final Directory dir = FSDirectory.open(new File(path)); final StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); final IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); iwc.setRAMBufferSizeMB(256.0); final IndexWriter writer = new IndexWriter(dir, iwc); final FieldType bodyOptions = new FieldType(); bodyOptions.setIndexed(true); bodyOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); bodyOptions.setStored(true); bodyOptions.setStoreTermVectors(true); bodyOptions.setTokenized(true); final Document doc = new Document(); doc.add(new Field("body", "this foobar is gibberish, baz", bodyOptions)); writer.addDocument(doc); final Document doc2 = new Document(); doc.add(new Field("body", "I don't know what to tell you, qux. Some foobar is just fubar.", bodyOptions)); writer.addDocument(doc2); writer.close(); } static void readIndex(final String path) throws IOException, CorruptIndexException { final DirectoryReader dirReader = DirectoryReader.open(FSDirectory.open(new File(path))); final SlowCompositeReaderWrapper rdr = new SlowCompositeReaderWrapper(dirReader); int max = rdr.maxDoc(); TermsEnum term = null; // iterate docs for (int i = 0; i < max; ++i) { // get term vector for body field final Terms terms = rdr.getTermVector(i, "body"); if (terms != null) { // count terms in doc int numTerms = 0; term = terms.iterator(term); while (term.next() != null) { System.out.println("doc " + i + " had term '" + term.term().utf8ToString() + "'"); ++numTerms; // would like to record doc term frequencies here, i.e., counts[i][term.term()] = term.freq() } System.out.println("doc " + i + " had " + numTerms + " terms"); } else { System.err.println("doc " + i + " had no term vector for body"); } } } public static void main(String[] args) throws IOException, InterruptedException, CorruptIndexException { final String path = args[0]; createIndex(path); readIndex(path); } } -- Jon Stewart, Principal (646) 719-0317 | j...@lightboxtechnologies.com | Arlington, VA --------------------------------------------------------------------- To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org For additional commands, e-mail: java-user-h...@lucene.apache.org