Retrieving Term vectors

Sarita Nair Tue, 19 Mar 2013 08:31:27 -0700

Hi All,

I am in the process upgrading from Solr 3.6.2 to Solr 4.1 and have been running 
into 
problems with retrieving Term vector information.


Below is the test and source code.  The tests fails with a 
NullPointerException, because DocsAndPositionsEnum is always null, despite the 
fact that I have stored term vectors. Any ideas on what is it that I am doing 
incorrectly, will be greatly appreciated.

public class LuceneUtilTest {

private final RAMDirectory ramDirectory = new RAMDirectory();
private static final String TERM_POSITION_PROVIDER = "term position provider";
private AtomicReader atomicReader;
private DirectoryReader dr;

@BeforeClass
public void init() throws IOException {
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_41, new 
StandardAnalyzer(
Version.LUCENE_41));
iwc.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(ramDirectory, iwc);
FieldType fieldType = new FieldType();
IndexOptions indexOptions = 
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
fieldType.setIndexOptions(indexOptions);
fieldType.setIndexed(true);
fieldType.setStoreTermVectors(true);
fieldType.setStored(true);
Document doc = new Document();
doc.add(new Field("content", "one quick brown fox jumped over one lazy dog", 
fieldType));
writer.addDocument(doc);
writer.commit();
writer.close();
dr = DirectoryReader.open(ramDirectory);
atomicReader = dr.leaves().get(0).reader();

}

@DataProvider(name = TERM_POSITION_PROVIDER)
public Object[][] getTermPositions() {
List<Object[]> data = new ArrayList<Object[]>();
data.add(new Object[] { "brown", new int[] { 2 } });
data.add(new Object[] { "dog", new int[] { 8 } });
data.add(new Object[] { "fox", new int[] { 3 } });
data.add(new Object[] { "jumped", new int[] { 4 } });
data.add(new Object[] { "lazy", new int[] { 7 } });
data.add(new Object[] { "one", new int[] { 0, 6 } });
data.add(new Object[] { "over", new int[] { 5 } });
data.add(new Object[] { "quick", new int[] { 1 } });
return data.toArray(new Object[data.size()][]);
}

@AfterClass
public void destroy() throws IOException {
atomicReader.close();
}

@Test(dataProvider = TERM_POSITION_PROVIDER)
public void testGetTermPositionMap(String query, int[] position) throws 
IOException,
ParseException {
Map<String, int[]> posMap = LuceneUtil.getTermPositionMap(atomicReader, 0, 
"content");
Assert.assertEquals(posMap.get(query)[0], position[0]);
}

}



#Method being tested

public final class LuceneUtil {
/**
 * Private constructor to prevent instantiation.
 */
private LuceneUtil() {}

/**
 * 
 * @param reader the index reader
 * @param fieldName name of the field of interest
 * @param docId internal doc ID of the document of interest
 * @return all Terms present in the requested field
 * @throws IOException on IndexReader error
 */
public static Terms getTerms(final AtomicReader reader, final String fieldName, 
final int docId) throws IOException{
return reader.getTermVector(docId, fieldName);
}

/**
 * Returns a map of the terms and their token positions for a field in a 
 * document. The map may be empty because vector information is not available 
 * for the requested field, or because the analyzer assigned to it found no 
 * terms in the original document field at index time.
 * 
 * @param reader Lucene index reader (for access to term vector info)
 * @param docId the internal Lucene ID of the document of interest
 * @param fieldName name of the field of interest
 * @return a map of term/positions pairs; the map may be empty.
 * @throws IOException on IndexReader error
 */
public static Map<String, int[]> getTermPositionMap(final AtomicReader reader,
final int docId, final String fieldName) throws IOException {
Map<String, int[]> termPosMap = new HashMap<String, int[]>();
Terms terms = LuceneUtil.getTerms(reader, fieldName, docId);
if(terms!=null) {
TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);
BytesRef term;
while ((term=termsEnum.next())!=null) {
String docTerm = term.utf8ToString();
DocsAndPositionsEnum docPosEnum = 
termsEnum.docsAndPositions(reader.getLiveDocs(),
null,
DocsAndPositionsEnum.FLAG_OFFSETS);
docPosEnum.nextDoc();
int freq = docPosEnum.freq();
int[] posArray = new int[freq];
for (int i = 0; i < freq; i++) {
int position = docPosEnum.nextPosition();
posArray[i]=position;
}
termPosMap.put(docTerm, posArray);
}
}
return termPosMap;
}

}

Retrieving Term vectors

Reply via email to