import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.regex.Pattern;

import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

//import org.apache.lucene.analysis.Analyzer;
//import org.apache.lucene.document.Document;
//import org.apache.lucene.document.Field;
//import org.apache.lucene.index.CorruptIndexException;
//import org.apache.lucene.index.IndexWriter;
//import org.apache.lucene.store.LockObtainFailedException;
//import org.exolab.castor.xml.MarshalException;
//import org.exolab.castor.xml.Marshaller;
//import org.exolab.castor.xml.ValidationException;
//import org.exolab.castor.xml.XMLContext;

//import edu.indiana.dlib.search.analyzers.CaseInsensitiveSRUSupportAnalyzer;
//import edu.indiana.dlib.xml.zthes.Relation;
//import edu.indiana.dlib.xml.zthes.Term;
//import edu.indiana.dlib.xml.zthes.TermNote;
//import edu.indiana.dlib.xml.zthes.Zthes;

/**
 * <p>
 *   A simple command-line tool that parses a directory of Getty
 *   TGN XML files and creates a Lucene index suitable for
 *   advanced SRW searching.
 * </p>
 * <p>
 *   The intended implementation indexes every field listed in the
 *   flat format with the name of the XML element.  Furthermore
 *   an ".exact" and ".facet" form will be indexed as well and a
 *   simple "zthes" field containing a valid ZThes XML fragment
 *   that represents the whole record.
 * </p>
 * <p>
 *   NOTE: the Lucene/Castor indexing code is currently commented
 *   out.  As the code stands, the tool parses every matching file,
 *   collects the fields of each Subject into a map and prints one
 *   summary line per Subject; no index is written.
 * </p>
 */
public class GettyTGNParser {

    /** Matches the names of the TGN data files (tgn1.xml, TGN2.XML, ...). */
    private static final Pattern FILENAME_PATTERN = Pattern.compile("(?i)^tgn(\\d+)\\.xml");

    /**
     * Element path (innermost element first) of the subject id that an
     * associative relationship points at.
     */
    private static final String[] RELATED_SUBJECT_ID_PATH =
            {"VP_Subject_ID", "Related_Subject_ID", "Associative_Relationship"};

    /**
     * Command-line entry point.
     *
     * @param args exactly two values: the directory holding TGN_CHARS.xml and
     *             the TGN data files, and the Lucene index directory (the
     *             latter is only used by the currently disabled indexing code)
     * @throws Exception if the character table or a data file cannot be read
     *                   or parsed
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.println("TGN Importation Tool:");
            System.out.println();
            System.out.println("Required parameter: [tgn files directory] [lucene index directory]");
            System.out.println("  This directory must contain a TGN_CHARS.XML, TGN(X).xml.");
            return;
        }
        File xmlDirectory = new File(args[0]);
        TGNCharacterConverter converter = new TGNCharacterConverter(new File(xmlDirectory, "TGN_CHARS.xml"));

        // Nothing is written to the log while the indexing code is disabled;
        // the file is still opened (in append mode) so the tool's on-disk
        // behaviour is unchanged, and it is now reliably closed.
        FileOutputStream fos = new FileOutputStream("log.txt", true);
        BufferedWriter log = new BufferedWriter(new OutputStreamWriter(fos));
        try {
            //Analyzer analyzer = new CaseInsensitiveSRUSupportAnalyzer();

            //IndexWriter writer = new IndexWriter(args[1], analyzer);

            File[] candidates = xmlDirectory.listFiles();
            if (candidates == null) {
                // listFiles() returns null when the path is not a directory
                // or an I/O error occurs.
                throw new IOException("Unable to list the contents of " + xmlDirectory);
            }

            // pass through each XML file updating the index for each term
            XMLInputFactory factory = XMLInputFactory.newInstance();
            int count = 0;
            for (File potentialFile : candidates) {
                if (FILENAME_PATTERN.matcher(potentialFile.getName()).matches()) {
                    System.out.println("Parsing " + potentialFile.getName());
                    count = parseFile(potentialFile, factory, converter, count);
                }
            }
            //writer.optimize();
            //writer.close();
        } finally {
            log.close();
        }
    }

    /**
     * Parses one TGN data file, collecting the fields of each Subject element
     * and printing a summary line when the Subject closes.
     *
     * @param xmlFile   the TGN data file to read
     * @param factory   factory used to create the stream reader
     * @param converter translates TGN character codes in captured text
     * @param count     number of Subjects seen before this file
     * @return the number of Subjects seen including those in this file
     * @throws IOException        if the file cannot be opened or closed
     * @throws XMLStreamException if the file is not well-formed XML
     */
    private static int parseFile(File xmlFile, XMLInputFactory factory,
            TGNCharacterConverter converter, int count) throws IOException, XMLStreamException {
        Map<String, List<String>> fieldToValuesMap = new HashMap<String, List<String>>();
        Stack<String> currentPath = new Stack<String>();
        //Term term = null;
        boolean relatedTo = false;
        // Non-null only while inside an element whose text is being captured.
        StringBuffer sb = null;

        FileInputStream is = new FileInputStream(xmlFile);
        try {
            XMLStreamReader parser = factory.createXMLStreamReader(is);
            try {
                while (parser.hasNext()) {
                    int eventType = parser.next();
                    switch (eventType) {
                        case XMLStreamReader.CHARACTERS:
                        case XMLStreamReader.CDATA:
                            // The parser may deliver one element's text in
                            // several events, so the raw text is accumulated
                            // here and converted once at the closing tag.
                            if (sb != null) {
                                sb.append(parser.getText());
                            }
                            break;
                        case XMLStreamReader.START_ELEMENT:
                            currentPath.push(parser.getLocalName());
                            if (currentPath.peek().equalsIgnoreCase("Subject")) {
                                count ++;
                                //term = new Term();
                                fieldToValuesMap.clear();
                                //term.setTermId(parser.getAttributeValue(0));
                                addField(fieldToValuesMap, "id", parser.getAttributeValue(0));
                            //} else if (term == null) {
                                // skip remaining processing, we are in a non-term-related element
                            } else if (isPath(currentPath, "Non-Preferred_Term")) {
                                // TODO: add code to add this to index as well
                            } else if (isPath(currentPath, "Associative_Relationship")) {
                                // Each relationship must announce its own type;
                                // do not inherit the flag from a previous one.
                                relatedTo = false;
                            } else if (isCapturedElement(currentPath, relatedTo)) {
                                // start a buffer of all character content (at the close
                                // tag we'll capture this field.
                                sb = new StringBuffer();
                            }
                            break;
                        case XMLStreamReader.END_ELEMENT: {
                            // text is null when nothing was captured for this
                            // element, or when a child element closed inside it
                            // and discarded the buffer.
                            String text = (sb == null) ? null : converter.convertCharacters(sb.toString());
                            //if (term == null) {
                                // skip remaining processing
                            //} else if (isPath(currentPath, "Note_Text", "Descriptive_Note")) {
                            if (isPath(currentPath, "Subject")) {
                                /*
                                if (term != null) {
                                    // write the term to the index
                                    Document indexDoc = new Document();
                                    // add each field
                                    for (String field : fieldToValuesMap.keySet()) {
                                        for (String value : fieldToValuesMap.get(field)) {
                                            indexDoc.add(new Field(field + ".exact", value, Field.Store.NO, Field.Index.TOKENIZED));
                                            indexDoc.add(new Field(field + ".facet", value, Field.Store.YES, Field.Index.TOKENIZED));
                                            indexDoc.add(new Field(field, value, Field.Store.NO, Field.Index.TOKENIZED));
                                        }
                                    }
                                    // add a zthes XML record
                                    try {
                                        ByteArrayOutputStream os = new ByteArrayOutputStream();

                                        // create a new Marshaller
                                        XMLContext context = new XMLContext();
                                        Marshaller marshaller = context.createMarshaller();
                                        marshaller.setValidation(false);
                                        marshaller.setSupressXMLDeclaration(true);
                                        //marshaller.setEncoding("UTF-8");
                                        marshaller.setWriter(new OutputStreamWriter(os, "UTF-8"));

                                        // marshal the zthes record
                                        Zthes zthes = new Zthes();
                                        zthes.addTerm(term);
                                        marshaller.marshal(zthes);
                                        String zthesXml = new String(os.toByteArray(), "UTF-8");
                                        // The "setSuppressXMLDeclaration" doesn't seem
                                        // to be respected here, so we manually remove it.
                                        zthesXml = zthesXml.substring(zthesXml.indexOf("<Zthes"));
                                        Field xmlField = new Field("zthes", zthesXml, Field.Store.COMPRESS, Field.Index.NO);
                                        indexDoc.add(xmlField);
                                    } catch (MarshalException ex) {
                                        throw new RuntimeException(ex);
                                    } catch (ValidationException ex) {
                                        throw new RuntimeException(ex);
                                    }
                                    writer.addDocument(indexDoc, analyzer);
                                    System.out.println("Completed term " + count + ": " + term.getTermName());
                                    log.flush();
                                    term = null;
                                }
                                */
                                System.out.println("Completed term " + count + ": " + fieldToValuesMap.get("termName") + "(" + fieldToValuesMap.get("id") + ")");
                            } else if (text == null) {
                                // Nothing was captured for this element.
                                // TODO: when a Non-Preferred_Term closes, add the unauthorized term
                            } else if (isPath(currentPath, "Note_Text", "Descriptive_Note")) {
                                //TermNote note = new TermNote();
                                //note.setLabel("descriptive note");
                                //note.setContent(sb.toString().trim());
                                //term.addTermNote(note);
                                addField(fieldToValuesMap, "descriptiveNote", text.trim());
                            } else if (isPath(currentPath, "Term_Text", "Preferred_Term", "Terms")) {
                                //term.setTermName(sb.toString());
                                addField(fieldToValuesMap, "termName", text.trim());
                            } else if (isPath(currentPath, "Place_Type_ID", "Preferred_Place_Type", "Place_Types")) {
                                //term.addTermCategory(sb.toString());
                                addField(fieldToValuesMap, "placeType", text.trim());
                            } else if (currentPath.peek().equalsIgnoreCase("Parent_Subject_ID")) {
                                //Relation bt = new Relation();
                                //bt.setRelationType("BT");
                                //bt.setTermId(sb.toString());
                                //term.addRelation(bt);
                                addField(fieldToValuesMap, "bt", text);
                            } else if (isPath(currentPath, "Relationship_Type", "Associative_Relationship")) {
                                if (text.equalsIgnoreCase("3000/related to")) {
                                    relatedTo = true;
                                }
                            } else if (relatedTo && isPath(currentPath, RELATED_SUBJECT_ID_PATH)) {
                                //Relation rt = new Relation();
                                //rt.setRelationType("RT");
                                //rt.setTermId(sb.toString());
                                //term.addRelation(rt);
                                addField(fieldToValuesMap, "rt", text);
                                relatedTo = false;
                            }
                            currentPath.pop();
                            sb = null;
                            break;
                        }
                        default:
                            break;
                    }
                }
            } finally {
                parser.close();
            }
        } finally {
            // XMLStreamReader.close() does not close the underlying stream.
            is.close();
        }
        return count;
    }

    /**
     * Tells whether the element on top of the stack is one whose character
     * content has to be buffered.
     *
     * @param currentPath the open elements, innermost on top (must not be empty)
     * @param relatedTo   whether the enclosing associative relationship has
     *                    been identified as a "related to" relationship
     */
    private static boolean isCapturedElement(Stack<String> currentPath, boolean relatedTo) {
        return isPath(currentPath, "Note_Text", "Descriptive_Note")
                || isPath(currentPath, "Term_Text", "Preferred_Term", "Terms")
                || isPath(currentPath, "Place_Type_ID", "Preferred_Place_Type", "Place_Types")
                || isPath(currentPath, "Term_Text", "Non-Preferred_Term", "Terms")
                || isPath(currentPath, "Term_ID", "Non-Preferred_Term")
                || currentPath.peek().equalsIgnoreCase("Parent_Subject_ID")
                || isPath(currentPath, "Relationship_Type", "Associative_Relationship")
                || (relatedTo && isPath(currentPath, RELATED_SUBJECT_ID_PATH));
    }

    /**
     * Checks whether the given path matches the top of the stack.  The path
     * is listed innermost element first: path[0] is compared (ignoring case)
     * with the top of the stack, path[1] with the entry below it, and so on.
     * For example, for a stack ("One", "Two", "Three") with "Three" on top,
     * the path "three", "Two" matches while "Two", "Three" does not.
     *
     * @return true if every path element matches; false if any differs or
     *         the path is longer than the stack
     */
    private static boolean isPath(Stack<String> stack, String... path) {
        int i = stack.size() - 1;
        for (String pathEl : path) {
            if (i < 0 || !pathEl.equalsIgnoreCase(stack.get(i --))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Appends a value to the list stored under the given field name, creating
     * the list on first use.
     */
    private static void addField(Map<String, List<String>> map, String name, String value) {
        List<String> values = map.get(name);
        if (values == null) {
            values = new ArrayList<String>();
            map.put(name, values);
        }
        values.add(value);
    }

    /**
     * Replaces the character codes found in TGN text with the Unicode
     * characters listed for them in the TGN character table file.
     */
    private static class TGNCharacterConverter {

        /** Maps the text of a VCS_CODE element to the decoded UNICODE text that follows it. */
        private Map<String, String> charMap;

        /**
         * Reads the character table.  Each VCS_CODE element is paired with
         * the next UNICODE element, whose text is a whitespace-separated list
         * of hexadecimal code points.
         *
         * @param characterXmlFile the character table (TGN_CHARS.xml)
         * @throws IOException        if the file cannot be opened or closed
         * @throws XMLStreamException if the file is not well-formed XML
         */
        public TGNCharacterConverter(File characterXmlFile) throws IOException, XMLStreamException {
            this.charMap = new HashMap<String, String>();
            FileInputStream is = new FileInputStream(characterXmlFile);
            try {
                XMLInputFactory factory = XMLInputFactory.newInstance();
                XMLStreamReader parser = factory.createXMLStreamReader(is);
                try {
                    String currentTag = null;
                    // Text of the element that is currently open; the parser
                    // may split it over several events.
                    StringBuffer text = null;
                    String vcsCode = null;
                    while (parser.hasNext()) {
                        int eventType = parser.next();
                        switch (eventType) {
                            case XMLStreamReader.CHARACTERS:
                            case XMLStreamReader.CDATA:
                                if (text != null) {
                                    text.append(parser.getText());
                                }
                                break;
                            case XMLStreamReader.START_ELEMENT:
                                currentTag = parser.getLocalName();
                                text = new StringBuffer();
                                break;
                            case XMLStreamReader.END_ELEMENT:
                                // Empty elements are ignored, as they were
                                // when no character event was delivered.
                                if (currentTag != null && text.length() > 0) {
                                    if (currentTag.equalsIgnoreCase("VCS_CODE")) {
                                        vcsCode = text.toString();
                                    } else if (currentTag.equalsIgnoreCase("UNICODE")) {
                                        String unicodeList = text.toString().trim();
                                        // A UNICODE value without a preceding
                                        // code has nothing to map from.
                                        if (vcsCode != null && unicodeList.length() > 0) {
                                            StringBuffer val = new StringBuffer();
                                            for (String hex : unicodeList.split("\\s+")) {
                                                // appendCodePoint also handles
                                                // values above U+FFFF.
                                                val.appendCodePoint(Integer.parseInt(hex, 16));
                                            }
                                            charMap.put(vcsCode, val.toString());
                                        }
                                        vcsCode = null;
                                    }
                                }
                                currentTag = null;
                                text = null;
                                break;
                            default:
                                break;
                        }
                    }
                } finally {
                    parser.close();
                }
            } finally {
                // XMLStreamReader.close() does not close the underlying stream.
                is.close();
            }
        }

        /**
         * Returns the line with every known character code replaced by its
         * Unicode equivalent.
         */
        public String convertCharacters(String line) {
            for (Map.Entry<String, String> mapping : this.charMap.entrySet()) {
                line = line.replace(mapping.getKey(), mapping.getValue());
            }
            return line;
        }

    }
}
