Hi Mary,
I've been out of the loop with Lucene and Java for a bit, so this might
not be entirely correct, but here is an example of how it might be accomplished
(you can also see it in this gist: https://gist.github.com/rainkinz/5645139).
The output looks like this:
** Also note I'm using Lucene 4.3, however I set the version to be
Version.LUCENE_40 for you. I don't think the APIs are different in this
case.
---------------------------------------------------
Term 'mary' appears 5 in the index
in doc 0 the term mary appears 1 times at positions 1
in doc 2 the term mary appears 1 times at positions 3
in doc 4 the term mary appears 1 times at positions 1
in doc 8 the term mary appears 1 times at positions 3
in doc 9 the term mary appears 1 times at positions 6
etc
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.util.Random;
public class CountingTerms {
private static final Version VERSION = Version.LUCENE_40;
private static final String[] terms = "hi am mary and i have a problem
with lucene".split(" ");
private final Directory indexDir = new RAMDirectory();
private String randomTerms() {
Random rand = new Random();
StringBuilder sb = new StringBuilder();
int numTerms = rand.nextInt(terms.length);
for (int i = 0; i < numTerms; i++) {
sb.append(terms[rand.nextInt(terms.length)]).append(" ");
}
return sb.toString();
}
private void addDocs(IndexWriter writer) throws IOException {
for (int i = 0; i < 10; i++) {
Document doc = new Document();
String randomStr = randomTerms();
puts("Adding random str: " + randomStr);
IndexableField field = new TextField("text", randomStr,
Field.Store.YES);
doc.add(field);
writer.addDocument(doc);
}
}
private void countTerms() throws IOException {
DirectoryReader indexReader = DirectoryReader.open(indexDir);
AtomicReader reader = indexReader.leaves().get(0).reader();
Fields fields = reader.fields();
Terms terms = fields.terms("text");
TermsEnum termsEnum = terms.iterator(null);
BytesRef term;
while ((term = termsEnum.next()) != null) {
puts("---------------------------------------------------");
puts("Term '" + term.utf8ToString() + "' appears " +
termsEnum.totalTermFreq() + " in the index");
DocsAndPositionsEnum docPosEnum =
termsEnum.docsAndPositions(reader.getLiveDocs(),
null,
DocsAndPositionsEnum.FLAG_OFFSETS);
int docid;
while ((docid = docPosEnum.nextDoc()) !=
DocsAndPositionsEnum.NO_MORE_DOCS) {
int freq = docPosEnum.freq();
int[] positions = new int[freq];
for (int i = 0; i < freq; i++) {
int position = docPosEnum.nextPosition();
positions[i]=position;
}
puts("in doc " + docid + " the term " + term.utf8ToString() + "
appears " + freq + " times at positions " + ppArray(positions));
}
}
indexReader.close();
}
private String ppArray(int[] arr) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < arr.length; i++) {
sb.append(arr[i]);
if (i + 1 < arr.length) sb.append(", ");
}
return sb.toString();
}
private void puts(Object msg) {
System.out.println(msg);
}
private void index() throws IOException {
IndexWriter indexWriter = new IndexWriter(indexDir,
new IndexWriterConfig(VERSION, new
WhitespaceAnalyzer(VERSION)));
addDocs(indexWriter);
indexWriter.commit();
indexWriter.close();
}
public static void main(String[] args) throws Exception {
CountingTerms ct = new CountingTerms();
ct.index();
ct.countTerms();
}
}
On Fri, May 24, 2013 at 12:14 PM, mary meriem <[email protected]> wrote:
> hii am mary and i have a problem with lucene, Actually a work with lucene
> 4.0.0, my problem is, how can I more listed all the terms, the display
> position for each term in each document and their frequency?please help
>
--
Brendan Grainger
www.kuripai.com