import java.io.*;
import java.util.Set;
import java.util.HashSet;
import java.util.Random;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

// TODO: use RAMDirectory or BatchIndexWriter
//       try with multiple threads and multiple indices
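/**
 * Fills a Lucene index with synthetic documents whose title and body are
 * built from words drawn at random from a dictionary file, printing timing
 * figures as it goes.
 *
 * Usage: java Words2Index <dict-file> <index-dir> <num-docs>
 */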
public class Words2Index
{
    // avg. word length = 8, so 8 * 3 words => ~24 bytes per title
    private static final short WORDS_PER_TITLE = 3;
    // avg. word length = 8, so 8 * 1000 words => ~8000 bytes per body
    private static final short WORDS_PER_BODY = 1000;

    private static String   _dict;
    private static String   _index;
    private static int      _indexSize;
    private static String[] _words;
    private static Random   _random;


    private Words2Index(String dict, String index, int indexSize)
    {
        _dict = dict;
        _index = index;
        _indexSize = indexSize;
        _random = new Random();
    }

    public static void main(String[] args)
        throws Exception
    {
        if (args.length != 3)
        {
            System.err.println("Usage: java Words2Index <dict-file> <index-dir> <num-docs>");
            System.exit(1);
        }

        Words2Index wi = new Words2Index(args[0], args[1], Integer.parseInt(args[2]));

        System.out.println("DICT:       " + _dict);
        System.out.println("INDEX:      " + _index);
        System.out.println("INDEX SIZE: " + _indexSize + " documents");

        System.out.println("Loading words from " + _dict);
        Set<String> wordSet = wi.loadWords();
        _words = wordSet.toArray(new String[wordSet.size()]);
        int maxRandNumber = _words.length;

        IndexWriter writer = new IndexWriter(_index, new StandardAnalyzer(), true);
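        // A high mergeFactor defers segment merging: adds go faster because
        // Lucene merges less often, at the cost of many segment files (and
        // slower searches) until optimize() collapses them.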
        writer.mergeFactor = 500;

        //BatchIndexWriter batchWriter = new BatchIndexWriter(writer, 2000);
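
        // One possible shape for the RAMDirectory TODO at the top of the file
        // (an untested sketch against the same Lucene 1.x API; the batching
        // loop is left out): stage documents in an in-memory index, then fold
        // each batch into the on-disk writer.
        //
        //     RAMDirectory ramDir = new RAMDirectory();
        //     IndexWriter ramWriter = new IndexWriter(ramDir, new StandardAnalyzer(), true);
        //     // ... ramWriter.addDocument(...) for one batch ...
        //     ramWriter.close();
        //     writer.addIndexes(new Directory[] { ramDir });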

        long startTime = System.currentTimeMillis();
        long batchStartTime = System.currentTimeMillis();
        long optimizeTime = 0;
        System.out.println("Adding documents to index " + _index);
        for (int i = 0; i < _indexSize; i++)
        {
            //batchWriter.addDocument(wi.makeDocument(maxRandNumber));
            writer.addDocument(wi.makeDocument(maxRandNumber));
            if (i % 100 == 0)
            {
                System.out.println(i + " " + String.valueOf(System.currentTimeMillis() - batchStartTime) + " ms");
                batchStartTime = System.currentTimeMillis();
            }
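            // Periodic optimize() keeps the segment count (and open file
            // handles) bounded during long runs, at the price of the merge
            // time measured below.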
            if (i > 0 && i % 50000 == 0)
            {
                System.out.print("Optimizing...");
                long startOptimizeTime = System.currentTimeMillis();
                //batchWriter.optimize();
                writer.optimize();
                long endOptimizeTime = System.currentTimeMillis();
                long thisOptimizeTime = endOptimizeTime - startOptimizeTime;
                System.out.println("done " + String.valueOf(thisOptimizeTime));
                optimizeTime += thisOptimizeTime;
                batchStartTime = System.currentTimeMillis();
            }
        }

        System.out.println("Optimizing and closing index " + _index);
        long startOptimizeTime = System.currentTimeMillis();
        //batchWriter.optimize();
        writer.optimize();
        long endOptimizeTime = System.currentTimeMillis();
        long thisOptimizeTime = endOptimizeTime - startOptimizeTime;
        System.out.println("done " + String.valueOf(thisOptimizeTime));
        optimizeTime += thisOptimizeTime;
        writer.close();

        long endTime = System.currentTimeMillis();
        long totalTime = endTime - startTime;

        System.out.println("Done");
        System.out.println("Time spent optimizing: " + String.valueOf(optimizeTime) + " ms");
        System.out.println("Total time: " + String.valueOf(totalTime) + " ms");
        System.out.println("Avg time per document (not counting optimizing) " + String.valueOf((totalTime - optimizeTime)/ wi._indexSize) + " ms");
        System.out.println("Avg time per document (counting optimizing) " + String.valueOf(totalTime/ wi._indexSize) + " ms");
        System.out.println("Avg time per 1000 documents (not counting optimizing) " + String.valueOf((totalTime - optimizeTime)/ (wi._indexSize/1000)) + " ms");
        System.out.println("Avg time per 1000 documents (counting optimizing) " + String.valueOf(totalTime/ (wi._indexSize/1000)) + " ms");
    }

    private Set<String> loadWords()
        throws IOException
    {
        int wordLength = 0;
        String word;
        // Initial capacity sized for a typical dictionary file of ~100k words.
        Set<String> wordSet = new HashSet<String>(99905);

        File file = new File(_dict);
        BufferedReader br = new BufferedReader(new FileReader(file));
        try
        {
            while ((word = br.readLine()) != null)
            {
                //System.out.println("WORD: " + word);
                wordSet.add(word);
                wordLength += word.length();
            }
        }
        finally
        {
            br.close();
        }
        System.out.println("WORD COUNT:        " + wordSet.size());
        System.out.println("TOTAL WORD LENGTH: " + wordLength);
        System.out.println("AVG WORD LENGTH:   " + wordLength / wordSet.size());

        return wordSet;
    }

    private Document makeDocument(int maxRandNumber)
    {
        Document doc = new Document();
        StringBuffer fieldValue = new StringBuffer(WORDS_PER_TITLE * 8);
        for (int i = 0; i < WORDS_PER_TITLE; i++)
        {
            int rand = _random.nextInt(maxRandNumber);
            fieldValue.append(" ").append(_words[rand]);
        }
        //System.out.println("Title: " + fieldValue);
        doc.add(Field.Text("title", fieldValue.toString()));

        fieldValue.setLength(0);
        // ensureCapacity, not setLength: setLength(n) would pad the buffer
        // with n NUL characters that end up in the indexed body text.
        fieldValue.ensureCapacity(WORDS_PER_BODY * 8);

        for (int i = 0; i < WORDS_PER_BODY; i++)
        {
            int rand = _random.nextInt(maxRandNumber);
            fieldValue.append(" ").append(_words[rand]);
        }
        //System.out.println("Body: " + fieldValue);
        doc.add(Field.UnStored("body", fieldValue.toString()));

        return doc;
    }
}