Hello! Please help me.
In my program, TermFreqVector does not work for Russian documents,
even though I use RussianAnalyzer.
The program sorts the words of each document by their frequency.
public class JavaApplication1 {
public static File dataDir = new File("C:/filestoindex");
public static File indexDir = new File("C:/fileindex");
public static void index(File indexDir,File dataDir) throws IOException
{
if (!dataDir.exists() || !dataDir.isDirectory())
{
throw new IOException(dataDir + " does not exist or is not a
directory");
}
Analyzer a=new RussianAnalyzer(Version.LUCENE_30);
IndexWriter indexWriter = new
IndexWriter(FSDirectory.open(indexDir),a, true,
IndexWriter.MaxFieldLength.UNLIMITED);
indexDirectory(indexWriter, dataDir);
indexWriter.close();
}
private static void indexDirectory(IndexWriter writer, File dir)
throws IOException {
File[] files = dir.listFiles();
for (int i = 0; i < files.length; i++)
{ File f = files[i];
if (f.isDirectory())
{ indexDirectory(writer, f);
}
indexFile(writer, f);
}
}
private static void indexFile(IndexWriter writer, File f) throws
IOException
{
System.out.println("Индексация " + f.getName());
Document doc = new Document();
doc.add(new Field("contents" , new FileReader(f),
Field.TermVector.YES));
doc.add(new Field("filename", f.getName(), Field.Store.YES,
Field.Index.NOT_ANALYZED));
doc.add(new Field("path", f.getCanonicalPath(), Field.Store.YES,
Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
}
public static void main(String[] args) throws Exception
{
index(indexDir, dataDir);
// File indexDirr = new File("C:/indexMaterial");
IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
for (int docNum=0; docNum<reader.numDocs(); docNum++)
{
HashMap<String,Integer> totalTfv = new
HashMap<String,Integer>(1024);
TermFreqVector tfv = reader.getTermFreqVector(docNum, "contents");
if (tfv == null)
{
continue;
}
String terms[] = tfv.getTerms();
int termCount = terms.length;
int freqs[] = tfv.getTermFrequencies();
for (int t=0; t < termCount; t++)
{
String term = terms[t];
int freq = freqs[t];
Integer totalFreq = totalTfv.get(term);
totalFreq = (totalFreq == null) ? freq : freq + totalFreq;
totalTfv.put(term, totalFreq);
}
List<Entry<String, Integer>> entries = new
ArrayList<Entry<String, Integer>>(totalTfv.entrySet());
Collections.sort(entries, new Comparator<Entry<String,
Integer>>() {
@Override
public int compare(Entry<String, Integer> e1, Entry<String,
Integer> e2) {
int v1 = e1.getValue(); // can be NPE when autounboxing
int v2 = e2.getValue();
return (v1 < v2) ? 1 : (v1 == v2) ? 0 : -1;
}
});
String adres="C:/fileout/"
+"out"+reader.document(docNum).getField("filename").stringValue()+".txt";
FileOutputStream fr = new FileOutputStream(adres);
for (Entry<String, Integer> e : entries)
{
System.out.println(reader.document(docNum).getField("filename").stringValue()+"
"+e.getKey() + "/" + e.getValue());
String
st=reader.document(docNum).getField("filename").stringValue()+" "+
e.getKey() + "/" + e.getValue()+ "\r\n";
fr.write(st.getBytes("UTF-8") );
}
fr.close();
}
}
}
--
View this message in context:
http://lucene.472066.n3.nabble.com/TermFreqVector-for-the-Russian-documents-tp4052069.html
Sent from the Lucene - General mailing list archive at Nabble.com.