import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.TreeMap;
import java.util.regex.Pattern;

import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

//import org.apache.lucene.analysis.Analyzer;
//import org.apache.lucene.document.Document;
//import org.apache.lucene.document.Field;
//import org.apache.lucene.index.CorruptIndexException;
//import org.apache.lucene.index.IndexWriter;
//import org.apache.lucene.store.LockObtainFailedException;
//import org.exolab.castor.xml.MarshalException;
//import org.exolab.castor.xml.Marshaller;
//import org.exolab.castor.xml.ValidationException;
//import org.exolab.castor.xml.XMLContext;
//import edu.indiana.dlib.search.analyzers.CaseInsensitiveSRUSupportAnalyzer;
//import edu.indiana.dlib.xml.zthes.Relation;
//import edu.indiana.dlib.xml.zthes.Term;
//import edu.indiana.dlib.xml.zthes.TermNote;
//import edu.indiana.dlib.xml.zthes.Zthes;

/**

 * A simple command-line tool that parses a directory of Getty
 * TGN XML files and creates a Lucene index suitable for
 * advanced SRW searching.
 *

 * The current implementation indexes every field listed in the
 * flat format under the name of the XML element. In addition,
 * an ".exact" and a ".facet" form are indexed as well, along with a
 * simple "zthes" field containing a valid ZThes XML fragment
 * that represents the whole record.
 *

*/ public class GettyTGNParser { private static final Pattern FILENAME_PATTERN = Pattern.compile("(?i)^tgn(\\d+)\\.xml"); public static void main(String[] args) throws Exception { if (args.length != 2) { System.out.println("TGN Importation Tool:"); System.out.println(); System.out.println("Required parameter: [tgn files directory] [lucene index directory]"); System.out.println(" This directory must contain a TGN_CHARS.XML, TGN(X).xml."); return; } File xmlDirectory = new File(args[0]); TGNCharacterConverter converter = new TGNCharacterConverter(new File(xmlDirectory, "TGN_CHARS.xml")); FileOutputStream fos = new FileOutputStream("log.txt", true); BufferedWriter log = new BufferedWriter(new OutputStreamWriter(fos)); //Analyzer analyzer = new CaseInsensitiveSRUSupportAnalyzer(); //IndexWriter writer = new IndexWriter(args[1], analyzer); // pass through each XML file updating the index for each term int count = 0; Map> fieldToValuesMap = new HashMap>(); for (File potentialFile : xmlDirectory.listFiles()) { if (FILENAME_PATTERN.matcher(potentialFile.getName()).matches()) { System.out.println("Parsing " + potentialFile.getName()); FileInputStream is = new FileInputStream(potentialFile); XMLInputFactory factory = XMLInputFactory.newInstance(); XMLStreamReader parser = factory.createXMLStreamReader(is); Stack currentPath = new Stack(); ArrayList unauthorizedTermNames = new ArrayList(); //Term term = null; boolean relatedTo = false; StringBuffer sb = null; while (parser.hasNext()) { int eventType = parser.next(); switch (eventType) { case XMLStreamReader.CHARACTERS: if (sb != null) { sb.append(converter.convertCharacters(parser.getText())); } break; case XMLStreamReader.START_ELEMENT: currentPath.push(parser.getLocalName()); if (currentPath.peek().equalsIgnoreCase("Subject")) { count ++; //term = new Term(); fieldToValuesMap.clear(); //term.setTermId(parser.getAttributeValue(0)); addField(fieldToValuesMap, "id", parser.getAttributeValue(0)); //} else if (term == null) { // 
skip remaining processing, we are in a non-term-related element } else if (isPath(currentPath, "Non-Preferred_Term")) { // TODO: add code to add this to index as well } else if (isPath(currentPath, "Note_Text", "Descriptive_Note") || isPath(currentPath, "Term_Text", "Preferred_Term", "Terms") || isPath(currentPath, "Place_Type_ID", "Preferred_Place_Type", "Place_Types") || isPath(currentPath, "Term_Text", "Non-Preferred_Term", "Terms") || isPath(currentPath, "Term_ID", "Non-Preferred_Term")) { // start a buffer of all character content (at the close // tag we'll capture this field. sb = new StringBuffer(); } else if (currentPath.peek().equalsIgnoreCase("Parent_Subject_ID") || (isPath(currentPath, "Relationship_Type", "Associative_Relationship")) || (relatedTo && isPath(currentPath, " VP_Subject_ID", "Related_Subject_ID","Associative_Relationship"))) { sb = new StringBuffer(); } break; case XMLStreamReader.END_ELEMENT: //if (term == null) { // skip remaining processing //} else if (isPath(currentPath, "Note_Text", "Descriptive_Note")) { if (isPath(currentPath, "Note_Text", "Descriptive_Note")) { //TermNote note = new TermNote(); //note.setLabel("descriptive note"); //note.setContent(sb.toString().trim()); //term.addTermNote(note); addField(fieldToValuesMap, "descriptiveNote", sb.toString().trim()); } else if (isPath(currentPath, "Term_Text", "Preferred_Term", "Terms")) { //term.setTermName(sb.toString()); addField(fieldToValuesMap, "termName", sb.toString().trim()); } else if (isPath(currentPath, "Place_Type_ID", "Preferred_Place_Type", "Place_Types")) { //term.addTermCategory(sb.toString()); addField(fieldToValuesMap, "placeType", sb.toString().trim()); } else if (currentPath.peek().equalsIgnoreCase("Parent_Subject_ID")) { //Relation bt = new Relation(); //bt.setRelationType("BT"); //bt.setTermId(sb.toString()); //term.addRelation(bt); addField(fieldToValuesMap, "bt", sb.toString()); } else if (isPath(currentPath, "Relationship_Type", "Associative_Relationship") && 
sb.toString().equalsIgnoreCase("3000/related to")) { relatedTo = true; } else if (relatedTo && isPath(currentPath, " VP_Subject_ID", "Related_Subject_ID","Associative_Relationship")) { //Relation rt = new Relation(); //rt.setRelationType("RT"); //rt.setTermId(sb.toString()); //term.addRelation(rt); addField(fieldToValuesMap, "rt", sb.toString()); relatedTo = false; } else if (isPath(currentPath, "Subject")) { /* if (term != null) { // write the term to the index Document indexDoc = new Document(); // add each field for (String field : fieldToValuesMap.keySet()) { for (String value : fieldToValuesMap.get(field)) { indexDoc.add(new Field(field + ".exact", value, Field.Store.NO, Field.Index.TOKENIZED)); indexDoc.add(new Field(field + ".facet", value, Field.Store.YES, Field.Index.TOKENIZED)); indexDoc.add(new Field(field, value, Field.Store.NO, Field.Index.TOKENIZED)); } } // add a zthes XML record try { ByteArrayOutputStream os = new ByteArrayOutputStream(); // create a new Marshaller XMLContext context = new XMLContext(); Marshaller marshaller = context.createMarshaller(); marshaller.setValidation(false); marshaller.setSupressXMLDeclaration(true); //marshaller.setEncoding("UTF-8"); marshaller.setWriter(new OutputStreamWriter(os, "UTF-8")); // marshal the zthes record Zthes zthes = new Zthes(); zthes.addTerm(term); marshaller.marshal(zthes); String zthesXml = new String(os.toByteArray(), "UTF-8"); // The "setSuppressXMLDeclaration" doesn't seem // to be respected here, so we manually remove it. zthesXml = zthesXml.substring(zthesXml.indexOf(" stack, String... 
path) { int i = stack.size() - 1; for (String pathEl : path) { if (i >= 0) { if (!pathEl.equalsIgnoreCase(stack.get(i --))) { return false; } } else { return false; } } return true; } private static void addField(Map> map, String name, String value) { if (map.containsKey(name)) { map.get(name).add(value); } else { ArrayList values = new ArrayList(); values.add(value); map.put(name, values); } } private static class TGNCharacterConverter { private Map charMap; public TGNCharacterConverter(File characterXmlFile) throws FileNotFoundException, XMLStreamException { this.charMap = new HashMap(); FileInputStream is = new FileInputStream(characterXmlFile); XMLInputFactory factory = XMLInputFactory.newInstance(); XMLStreamReader parser = factory.createXMLStreamReader(is); String currentTag = null; String vcsCode = null; while (parser.hasNext()) { int eventType = parser.next(); switch (eventType) { case XMLStreamReader.CHARACTERS: if (currentTag == null) { break; } else if (currentTag.equalsIgnoreCase("VCS_CODE")) { vcsCode = parser.getText(); } else if (currentTag.equalsIgnoreCase("UNICODE")) { String unicodeList = parser.getText(); StringBuffer val = new StringBuffer(); for (String hex : unicodeList.split(" ")) { val.append((char) Integer.parseInt(hex, 16)); } charMap.put(vcsCode, "" + val); vcsCode = null; } break; case XMLStreamReader.START_ELEMENT: currentTag = parser.getLocalName(); break; case XMLStreamReader.END_ELEMENT: currentTag = null; break; default: break; } } parser.close(); } public String convertCharacters(String line) { for (String key : this.charMap.keySet()) { line = line.replace(key, this.charMap.get(key)); } return line; } } }