Typo: you need doc2.add(...), not a second doc.add(...) — in your code the second field is added to doc, so doc2 is committed empty, which is why doc 1 has no term vector for body.
-- Ian. On Thu, Jan 17, 2013 at 2:49 PM, Jon Stewart <j...@lightboxtechnologies.com> wrote: > On Thu, Jan 17, 2013 at 9:08 AM, Robert Muir <rcm...@gmail.com> wrote: >> Which statistics in particular (which methods)? > > I'd like to know the frequency of each term in each document. Those > term counts for the most frequent terms in the corpus will make it > into the document vectors for clustering. > > Looking at Terms and TermsEnum, I'm actually somewhat baffled about > how to do this. Iterating over the TermsEnums in a Terms retrieved by > IndexReader.getTermVector() will tell me about the presence of a term > within a document, but I don't see a simple "count" or "freq" method > in TermsEnum--the methods there look like corpus statistics. > > Based on Ian's reply, I created the following one-file test program. > The results I get are weird: I get a term vector back for the first > document, but not for the second. > > Output: > doc 0 had term 'baz' > doc 0 had term 'foobar' > doc 0 had term 'gibberish' > doc 0 had 3 terms > doc 1 had no term vector for body > > Thanks again for the responses and assistance. 
> > > Jon > > > import java.io.File; > import java.io.IOException; > > import org.apache.lucene.analysis.standard.StandardAnalyzer; > > import org.apache.lucene.index.IndexWriter; > import org.apache.lucene.index.IndexWriterConfig.OpenMode; > import org.apache.lucene.index.IndexWriterConfig; > import org.apache.lucene.index.FieldInfo.IndexOptions; > import org.apache.lucene.index.CorruptIndexException; > import org.apache.lucene.index.AtomicReader; > import org.apache.lucene.index.IndexableField; > import org.apache.lucene.index.Terms; > import org.apache.lucene.index.TermsEnum; > import org.apache.lucene.index.SlowCompositeReaderWrapper; > import org.apache.lucene.index.DirectoryReader; > > import org.apache.lucene.store.Directory; > import org.apache.lucene.store.FSDirectory; > > import org.apache.lucene.util.BytesRef; > import org.apache.lucene.util.Version; > > import org.apache.lucene.document.Document; > import org.apache.lucene.document.Field; > import org.apache.lucene.document.StringField; > import org.apache.lucene.document.FieldType; > > public class LuceneTest { > > static void createIndex(final String path) throws IOException, > CorruptIndexException { > final Directory dir = FSDirectory.open(new File(path)); > final StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); > final IndexWriterConfig iwc = new > IndexWriterConfig(Version.LUCENE_40, analyzer); > iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); > iwc.setRAMBufferSizeMB(256.0); > final IndexWriter writer = new IndexWriter(dir, iwc); > > final FieldType bodyOptions = new FieldType(); > bodyOptions.setIndexed(true); > > bodyOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); > bodyOptions.setStored(true); > bodyOptions.setStoreTermVectors(true); > bodyOptions.setTokenized(true); > > final Document doc = new Document(); > doc.add(new Field("body", "this foobar is gibberish, baz", bodyOptions)); > writer.addDocument(doc); > > final Document doc2 = new 
Document(); > doc.add(new Field("body", "I don't know what to tell you, qux. > Some foobar is just fubar.", bodyOptions)); > writer.addDocument(doc2); > > writer.close(); > } > > static void readIndex(final String path) throws IOException, > CorruptIndexException { > final DirectoryReader dirReader = > DirectoryReader.open(FSDirectory.open(new File(path))); > final SlowCompositeReaderWrapper rdr = new > SlowCompositeReaderWrapper(dirReader); > > int max = rdr.maxDoc(); > > TermsEnum term = null; > // iterate docs > for (int i = 0; i < max; ++i) { > // get term vector for body field > final Terms terms = rdr.getTermVector(i, "body"); > if (terms != null) { > // count terms in doc > int numTerms = 0; > term = terms.iterator(term); > while (term.next() != null) { > System.out.println("doc " + i + " had term '" + > term.term().utf8ToString() + "'"); > ++numTerms; > > // would like to record doc term frequencies here, i.e., > counts[i][term.term()] = term.freq() > } > System.out.println("doc " + i + " had " + numTerms + " terms"); > } > else { > System.err.println("doc " + i + " had no term vector for body"); > } > } > } > > public static void main(String[] args) throws IOException, > InterruptedException, CorruptIndexException { > final String path = args[0]; > createIndex(path); > readIndex(path); > } > } > > -- > Jon Stewart, Principal > (646) 719-0317 | j...@lightboxtechnologies.com | Arlington, VA > > --------------------------------------------------------------------- > To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org > For additional commands, e-mail: java-user-h...@lucene.apache.org > --------------------------------------------------------------------- To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org For additional commands, e-mail: java-user-h...@lucene.apache.org