Author: schor Date: Fri Jan 20 19:33:07 2012 New Revision: 1234089 URL: http://svn.apache.org/viewvc?rev=1234089&view=rev Log: [UIMA-239] support parsing UIMA descriptors in a mode that preserves comments and formatting whitespace.
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/resource/metadata/impl/MetaDataObject_impl.java uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/XMLParser.java uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/XMLSerializer.java uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/SaxDeserializer_impl.java uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/XMLParser_impl.java Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/resource/metadata/impl/MetaDataObject_impl.java URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/resource/metadata/impl/MetaDataObject_impl.java?rev=1234089&r1=1234088&r2=1234089&view=diff ============================================================================== --- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/resource/metadata/impl/MetaDataObject_impl.java (original) +++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/resource/metadata/impl/MetaDataObject_impl.java Fri Jan 20 19:33:07 2012 @@ -49,20 +49,25 @@ import org.apache.uima.util.InvalidXMLEx import org.apache.uima.util.NameClassPair; import org.apache.uima.util.XMLParser; import org.apache.uima.util.XMLSerializer; +import org.apache.uima.util.XMLSerializer.CharacterValidatingContentHandler; import org.apache.uima.util.XMLizable; +import org.w3c.dom.CharacterData; import org.w3c.dom.Comment; +import org.w3c.dom.DOMException; +import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; +import org.w3c.dom.Text; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.xml.sax.ext.LexicalHandler; import org.xml.sax.helpers.AttributesImpl; /** * Abstract base class for all MetaDataObjects in the reference implementation. 
Provides basic - * support for getting and setting property values given their names, by storing all attribute - * values in a HashMap keyed on attribute name. + * support for getting and setting property values given their names, using bean introspection and reflection. * <p> * Also provides the ability to write objects to XML and build objects from their DOM * representation, as required to implement the {@link XMLizable} interface, which is a @@ -92,6 +97,13 @@ public abstract class MetaDataObject_imp private transient URL mSourceUrl; + // This is only used if we are capturing comments and ignorable whitespace in the XML + private transient Node infoset = null; // by default, set to null + + public void setInfoset(Node infoset) { + this.infoset = infoset; + } + /** * Creates a new <code>MetaDataObject_impl</code> with null attribute values */ @@ -549,11 +561,7 @@ public abstract class MetaDataObject_imp * a Writer to which the XML string will be written */ public void toXML(Writer aWriter) throws SAXException, IOException { - XMLSerializer sax2xml = new XMLSerializer(aWriter); - ContentHandler contentHandler = sax2xml.getContentHandler(); - contentHandler.startDocument(); - toXML(sax2xml.getContentHandler(), true); - contentHandler.endDocument(); + toXML(new XMLSerializer(aWriter)); } /** @@ -563,11 +571,14 @@ public abstract class MetaDataObject_imp * an OutputStream to which the XML string will be written */ public void toXML(OutputStream aOutputStream) throws SAXException, IOException { - XMLSerializer sax2xml = new XMLSerializer(aOutputStream); + toXML(new XMLSerializer(aOutputStream)); + } + + private void toXML(XMLSerializer sax2xml) throws SAXException, IOException { ContentHandler contentHandler = sax2xml.getContentHandler(); contentHandler.startDocument(); toXML(sax2xml.getContentHandler(), true); - contentHandler.endDocument(); + contentHandler.endDocument(); } /** @@ -583,7 +594,7 @@ public abstract class MetaDataObject_imp public void 
toXML(ContentHandler aContentHandler, boolean aWriteDefaultNamespaceAttribute) throws SAXException { XmlizationInfo inf = getXmlizationInfo(); - + // write the element's start tag // get attributes (can be provided by subclasses) AttributesImpl attrs = getXMLAttributes(); @@ -595,16 +606,22 @@ public abstract class MetaDataObject_imp } // start element - aContentHandler.startElement(inf.namespace, inf.elementTagName, inf.elementTagName, attrs); - - // write child elements - for (int i = 0; i < inf.propertyInfo.length; i++) { - PropertyXmlInfo propInf = inf.propertyInfo[i]; - writePropertyAsElement(propInf, inf.namespace, aContentHandler); + outputStartElement(aContentHandler, infoset, inf.namespace, inf.elementTagName, inf.elementTagName, attrs); + // write child elements + + CharacterValidatingContentHandler cc = (CharacterValidatingContentHandler) aContentHandler; + cc.lastOutputNodeAddLevel(); + try { + for (int i = 0; i < inf.propertyInfo.length; i++) { + PropertyXmlInfo propInf = inf.propertyInfo[i]; + writePropertyAsElement(propInf, inf.namespace, aContentHandler); + } + } finally { + cc.lastOutputNodeClearLevel(); } - + // end element - aContentHandler.endElement(inf.namespace, inf.elementTagName, inf.elementTagName); + outputEndElement(aContentHandler, infoset, inf.namespace, inf.elementTagName, inf.elementTagName); } /** @@ -666,10 +683,9 @@ public abstract class MetaDataObject_imp return; // if XML element name was supplied, write a tag - if (aPropInfo.xmlElementName != null) { - aContentHandler.startElement(aNamespace, aPropInfo.xmlElementName, aPropInfo.xmlElementName, + Node elementNode = findMatchingSubElement(aContentHandler, aPropInfo.xmlElementName); + outputStartElement(aContentHandler, elementNode, aNamespace, aPropInfo.xmlElementName, aPropInfo.xmlElementName, EMPTY_ATTRIBUTES); - } // get class of property Class propClass = getAttributeClass(aPropInfo.propertyName); @@ -701,9 +717,7 @@ public abstract class MetaDataObject_imp } // if XML 
element name was supplied, end the element that we started - if (aPropInfo.xmlElementName != null) { - aContentHandler.endElement(aNamespace, aPropInfo.xmlElementName, aPropInfo.xmlElementName); - } + outputEndElement(aContentHandler, elementNode, aNamespace, aPropInfo.xmlElementName, aPropInfo.xmlElementName); } /** @@ -729,47 +743,51 @@ public abstract class MetaDataObject_imp throws SAXException { // if aPropClass is generic Object, reader won't know whether to expect // an array, so we tell it be writing an "array" element here. + Node arraySubElement = findMatchingSubElement(aContentHandler, "array"); if (aPropClass == Object.class) { - aContentHandler.startElement(aNamespace, "array", "array", EMPTY_ATTRIBUTES); + outputStartElement(aContentHandler, arraySubElement, aNamespace, "array", "array", EMPTY_ATTRIBUTES); } // iterate through elements of the array (at this point we don't allow // nested arrays here int len = ((Object[]) aValue).length; - for (int i = 0; i < len; i++) { - Object curElem = Array.get(aValue, i); - - // if a particular array element tag has been specified, write it - if (aArrayElementTagName != null) { - aContentHandler.startElement(aNamespace, aArrayElementTagName, aArrayElementTagName, - EMPTY_ATTRIBUTES); - } - - // if attribute's value is an XMLizable object, call its toXML method - if (curElem instanceof XMLizable) { - ((XMLizable) curElem).toXML(aContentHandler); - } - // else, attempt to write it as a primitive - else { - if (aArrayElementTagName == null) { - // need to include the type, e.g. 
<string> - XMLUtils.writePrimitiveValue(curElem, aContentHandler); - } else { - // don't include the type - just write the value - String valStr = curElem.toString(); - aContentHandler.characters(valStr.toCharArray(), 0, valStr.length()); + CharacterValidatingContentHandler cc = (CharacterValidatingContentHandler) aContentHandler; + cc.lastOutputNodeAddLevel(); + try { + for (int i = 0; i < len; i++) { + Object curElem = Array.get(aValue, i); + Node matchingArrayElement = findMatchingSubElement(aContentHandler, aArrayElementTagName); + + // if a particular array element tag has been specified, write it + outputStartElement(aContentHandler, matchingArrayElement, aNamespace, aArrayElementTagName, aArrayElementTagName, + EMPTY_ATTRIBUTES); + + // if attribute's value is an XMLizable object, call its toXML method + if (curElem instanceof XMLizable) { + ((XMLizable) curElem).toXML(aContentHandler); + } + // else, attempt to write it as a primitive + else { + if (aArrayElementTagName == null) { + // need to include the type, e.g. 
<string> + XMLUtils.writePrimitiveValue(curElem, aContentHandler); + } else { + // don't include the type - just write the value + String valStr = curElem.toString(); + aContentHandler.characters(valStr.toCharArray(), 0, valStr.length()); + } } + + // if we started an element, end it + outputEndElement(aContentHandler, matchingArrayElement, aNamespace, aArrayElementTagName, aArrayElementTagName); } - - // if we started an element, end it - if (aArrayElementTagName != null) { - aContentHandler.endElement(aNamespace, aArrayElementTagName, aArrayElementTagName); - } + } finally { + cc.lastOutputNodeClearLevel(); } // if we started an "Array" element, end it if (aPropClass == Object.class) { - aContentHandler.endElement(aNamespace, "array", "array"); + outputEndElement(aContentHandler, arraySubElement, aNamespace, "array", "array"); } } @@ -797,55 +815,54 @@ public abstract class MetaDataObject_imp String aKeyXmlAttribute, String aValueTagName, boolean aOmitIfNull, String aNamespace, ContentHandler aContentHandler) throws SAXException { // get map - Map theMap = (Map) getAttributeValue(aPropName); - + @SuppressWarnings("unchecked") + Map<String, Object> theMap = (Map<String, Object>) getAttributeValue(aPropName); + Node matchingNode = findMatchingSubElement(aContentHandler, aXmlElementName); + // if map is empty handle appropriately if (theMap == null || theMap.isEmpty()) { if (!aOmitIfNull && aXmlElementName != null) { - aContentHandler - .startElement(aNamespace, aXmlElementName, aXmlElementName, EMPTY_ATTRIBUTES); - aContentHandler.endElement(aNamespace, aXmlElementName, aXmlElementName); + outputStartElement(aContentHandler, matchingNode, aNamespace, aXmlElementName, aXmlElementName, EMPTY_ATTRIBUTES); + outputEndElement(aContentHandler, matchingNode, aNamespace, aXmlElementName, aXmlElementName); } } else { // write start tag for attribute if desired - if (aXmlElementName != null) { - aContentHandler - .startElement(aNamespace, aXmlElementName, aXmlElementName, 
EMPTY_ATTRIBUTES); - } - - // iterate over entries in the Map - Set entries = theMap.entrySet(); - Iterator i = entries.iterator(); - while (i.hasNext()) { - Map.Entry curEntry = (Map.Entry) i.next(); - String key = (String) curEntry.getKey(); + outputStartElement(aContentHandler, matchingNode, aNamespace, aXmlElementName, aXmlElementName, EMPTY_ATTRIBUTES); - // write a tag for the value, with a "key" attribute - AttributesImpl attrs = new AttributesImpl(); - attrs.addAttribute("", aKeyXmlAttribute, aKeyXmlAttribute, null, key); // are these nulls - // OK? - aContentHandler.startElement(aNamespace, aValueTagName, aValueTagName, attrs); - - // write the value (must be XMLizable or an array of XMLizable) - Object val = curEntry.getValue(); - if (val.getClass().isArray()) { - Object[] arr = (Object[]) val; - for (int j = 0; j < arr.length; j++) { - XMLizable elem = (XMLizable) arr[j]; - elem.toXML(aContentHandler); + CharacterValidatingContentHandler cc = (CharacterValidatingContentHandler) aContentHandler; + cc.lastOutputNodeAddLevel(); + try { + // iterate over entries in the Map + for (Map.Entry<String, Object> curEntry : theMap.entrySet()) { + String key = curEntry.getKey(); + + // write a tag for the value, with a "key" attribute + AttributesImpl attrs = new AttributesImpl(); + attrs.addAttribute("", aKeyXmlAttribute, aKeyXmlAttribute, null, key); // are these nulls OK? 
+ Node innerMatchingNode = findMatchingSubElement(aContentHandler, aValueTagName); + outputStartElement(aContentHandler, innerMatchingNode, aNamespace, aValueTagName, aValueTagName, attrs); + + // write the value (must be XMLizable or an array of XMLizable) + Object val = curEntry.getValue(); + if (val.getClass().isArray()) { + Object[] arr = (Object[]) val; + for (int j = 0; j < arr.length; j++) { + XMLizable elem = (XMLizable) arr[j]; + elem.toXML(aContentHandler); + } + } else { + ((XMLizable) val).toXML(aContentHandler); } - } else { - ((XMLizable) val).toXML(aContentHandler); + + // write end tag for the value + outputEndElement(aContentHandler, innerMatchingNode, aNamespace, aValueTagName, aValueTagName); } - - // write end tag for the value - aContentHandler.endElement(aNamespace, aValueTagName, aValueTagName); + } finally { + cc.lastOutputNodeClearLevel(); } // if we wrote start tag for attribute, now write end tag - if (aXmlElementName != null) { - aContentHandler.endElement(aNamespace, aXmlElementName, aXmlElementName); - } + outputEndElement(aContentHandler, matchingNode, aNamespace, aXmlElementName, aXmlElementName); } } @@ -889,6 +906,10 @@ public abstract class MetaDataObject_imp throw new InvalidXMLException(InvalidXMLException.INVALID_ELEMENT_TYPE, new Object[] { getXmlizationInfo().elementTagName, aElement.getTagName() }); + if (aOptions.preserveComments) { + infoset = aElement; + } + // get child elements, each of which represents a property List<String> foundProperties = new ArrayList<String>(); NodeList childNodes = aElement.getChildNodes(); @@ -921,9 +942,6 @@ public abstract class MetaDataObject_imp readUnknownPropertyValueFromXMLElement(curElem, aParser, aOptions, foundProperties); } } - } else if (curNode instanceof Comment) { - Comment curElem = (Comment) curNode; - String comment = curElem.getData(); } } } @@ -1285,4 +1303,287 @@ public abstract class MetaDataObject_imp return pd; } + + /** + * Heuristics for comment and whitespace 
processing + * + * Example: + * <!-- at top --> + * <a> <!-- same line --> + * <b/> + * <d> <!-- cmt --> <e/> </d> + * <c/> <!-- same line --> + * <!-- unusual case, following final one at a level --> + * </a> <!-- same line --> + * <!-- at bottom --> + * + * Each element has 2 calls: + * startElement, endElement + * Surround these with: + * maybeOutputCommentsBefore + * maybeOutputCommentsAfter + * + * Detect top level (by fact that parent is null), and for top level: + * collect all above -> output before startelement + * collect all below -> output after endelement + * + * For normal element node, "start": + * --> output before element + * collect all prev siblings up to first newline (assume before that, the comment goes with the node having children) + * if no nl assume comments go with previous element, and skip here + * (stop looking if get null for getPreviousSibling()) + * (stop looking if get other than comment or ignorable whitespace) + * (ignorable whitespace not always distinguishable from text that is whitespace?) + * --> output after element: + * if element children: eg: <start> <!-- cmt --> + * collect all up to and including first nl before first child + * (stop at first Element node; if no nl, then the source had multiple elements on one line: + * associate the comments and whitespace with previous (and output them). + * + * if no element children: - means it's written <xxx/> or <xxx></xxx> or <xxx> something </xxx> + * output nothing - after comments will be done following endElement call + * + * For normal element node, "end": + * --> output before element + * if element children: + * collect all after last child Element; skip all up to first nl (assume before that, the comment goes with last child node) + * if no nl (e.g. 
</lastChild> <!-- cmt --> </elementBeingEnded> ) + * assume comments go with previous element, and skip here + * (stop looking if get null for getNextSibling()) + * (stop looking if get Element) + * + * if no element children - output nothing + * --> output after element + * if this element has no successor sibling elements + * collect all up to the null + * else + * collect all up to and including first nl from getNextSibling(). + * (stop at first Element) + * + * For implied element nodes (no Java model object corresponding) + * We have only the "parent" node, and the element name. Try to do matching on the element name + * In this case, we always are working with the children in the Dom infoset; we have a last-outputted reference + * Scan from last-outputted, to find element match, and then use that element as the "root". + * + */ + + /** + * CoIw = Comment or IgnorableWhitespace + * + */ + + private void maybeOutputCoIwBeforeStart(ContentHandler contentHandler, Node node) throws SAXException { + if (null == node) { + return; + } + if (node.getParentNode() instanceof Document) { + + // Special handling for top node: + // The SAX parser doesn't do callbacks for whitespace that come before the top node. + // It does do callbacks for comments, though. 
+ + // For this case, we do (one time) insert of "nl" as follows: + // 1 nl before top element + // 1 nl before each preceding comment + + outputNL(contentHandler); + + for (Node c = node.getParentNode().getFirstChild(); c != node; c = c.getNextSibling()) { + if (c instanceof Comment) { + outputCoIw(contentHandler, c); + outputNL(contentHandler); + } + } + return; + } + for (Node p = getFirstPrevCoIw(node); p != node; p = p.getNextSibling()) { + outputCoIw(contentHandler, p); + } + } + + private void maybeOutputCoIwAfterStart(ContentHandler contentHandler, Node node) throws SAXException { + if (null == node || (!hasElementChildNode(node))) { + return; + } + + for (Node n = node.getFirstChild(); isCoIw(n); n = n.getNextSibling()) { + outputCoIw(contentHandler, n); + if (hasNewline(n)) { + return; + } + } + } + + private void maybeOutputCoIwBeforeEnd(ContentHandler contentHandler, Node node) throws SAXException { + if (null == node || (!hasElementChildNode(node))) { + return; + } + Node n = node.getLastChild(); + Node np = null; + boolean newlineFound = false; + for (Node p = n; p != null && !(p instanceof Element); p = p.getPreviousSibling()) { + if (hasNewline(p)) { + newlineFound = true; + } + np = p; + } + if (!newlineFound) { + return; + } + for (Node o = skipUpToFirstAfterNL(np); o != null; o = o.getNextSibling()) { + outputCoIw(contentHandler, o); + } + } + + private void maybeOutputCoIwAfterEnd(ContentHandler contentHandler, Node node) throws SAXException { + if (null == node) { + return; + } + for (Node o = node.getNextSibling(); isCoIw(o); o = o.getNextSibling()) { + outputCoIw(contentHandler, o); + if (hasNewline(o)) { + break; + } + } + } + + /** + * Scan from last output node the child nodes, looking for a matching element. + * Side effect if found - set lastoutput node to the found one.
+ * @param contentHandler + * @param elementName + * @return null (if no match) or matching node + */ + private Node findMatchingSubElement(ContentHandler contentHandler, String elementName) { + if (null == infoset || null == elementName) { + return null; + } + CharacterValidatingContentHandler c = (CharacterValidatingContentHandler) contentHandler; + Node lastOutput = c.getLastOutputNode(); + Node n = (lastOutput == null) ? infoset.getFirstChild() : lastOutput.getNextSibling(); + for (; n != null; n = n.getNextSibling()) { + if ((n instanceof Element) && + elementName.equals(((Element)n).getTagName())) { + c.setLastOutputNode(n); + return n; + } + } + return null; + } + + /** + * Scan backwards from argument node, continuing until get something other than + * comment or ignorable whitespace. + * Return the first node after a nl + * If no nl found, return original node + * + * NOTE: never called with original == the top node + * @param r - guaranteed non-null + * @return + */ + private Node getFirstPrevCoIw(Node original) { + boolean newlineFound = false; + Node p = original; // tracks one behind r + for (Node r = p.getPreviousSibling(); isCoIw(r); r = r.getPreviousSibling()) { + if (hasNewline(r)) { + newlineFound = true; + } + p = r; + } + if (!newlineFound) { + return original; + } + return skipUpToFirstAfterNL(p); + } + + /** + * Skip nodes going forwards until find one with a nl, then return the one following + * @param n must not be null, and there must be a NL in the siblings + * @return node following the one with a new line + */ + private Node skipUpToFirstAfterNL(Node n) { + while (!hasNewline(n)) { + n = n.getNextSibling(); + } + return n.getNextSibling(); + } + + private boolean hasNewline(Node n) { + if (n instanceof Comment) { + return false; + } + CharacterData c = (CharacterData) n; + return (-1) != c.getData().indexOf('\n'); + } + + private boolean hasElementChildNode(Node n) { + for (Node c = n.getFirstChild(); (c != null); c = c.getNextSibling()) 
{ + if (c instanceof Element) { + return true; + } + } + return false; + } + + private void outputCoIw(ContentHandler contentHandler, Node p) throws DOMException, SAXException { + if (p instanceof Comment) { + Comment c = (Comment)p; + ((LexicalHandler)contentHandler).comment(c.getData().toCharArray(), 0, c.getLength()); + } else { + String s = p.getTextContent(); + contentHandler.characters(s.toCharArray(), 0, s.length()); + } + + } + + private boolean isCoIw(Node n) { + return (n != null) && ((n instanceof Comment) || isWhitespaceText(n)); + } + + private boolean isWhitespaceText(Node n) { + if (!(n instanceof Text)) { + return false; + } + Text t = (Text) n; + String s = t.getData(); + for (int i = 0; i < s.length(); i++) { + if (!Character.isWhitespace(s.charAt(i))) { + return false; + } + } + return true; + } + + private void outputStartElement(ContentHandler aContentHandler, + Node node, + String aNamespace, + String localname, + String qname, + Attributes attributes) throws SAXException { + if (null == localname) { + return; + } + maybeOutputCoIwBeforeStart(aContentHandler, node); + aContentHandler.startElement(aNamespace, localname, qname, attributes); + maybeOutputCoIwAfterStart(aContentHandler, node); + } + + private void outputEndElement(ContentHandler aContentHandler, + Node node, + String aNamespace, + String localname, + String qname) throws SAXException { + if (null == localname) { + return; + } + maybeOutputCoIwBeforeEnd(aContentHandler, node); + aContentHandler.endElement(aNamespace, localname, qname); + maybeOutputCoIwAfterEnd(aContentHandler, node); + } + + private static final char[] nlca = new char[] {'\n'}; + private void outputNL(ContentHandler contentHandler) throws SAXException { + contentHandler.characters(nlca, 0, 1); + } + } Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/XMLParser.java URL: 
http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/XMLParser.java?rev=1234089&r1=1234088&r2=1234089&view=diff ============================================================================== --- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/XMLParser.java (original) +++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/XMLParser.java Fri Jan 20 19:33:07 2012 @@ -815,13 +815,18 @@ public interface XMLParser { * @deprecated XInclude is no longer supported */ @Deprecated - public boolean expandXIncludes; + public boolean expandXIncludes; /** * Whether to expand <envVarRef>VARNAME</envVarRef> elements by substituting the * value of the System property VARNAME. */ public boolean expandEnvVarRefs; + + /** + * Whether to preserve comments and ignorable whitespace + */ + public boolean preserveComments = false; /** * Creates a new ParsingOptions object. @@ -834,7 +839,7 @@ public interface XMLParser { * @deprecated XInclude is no longer supported */ @Deprecated - public ParsingOptions(boolean aExpandXIncludes, boolean aExpandEnvVarRefs) { + public ParsingOptions(boolean aExpandXIncludes, boolean aExpandEnvVarRefs) { expandXIncludes = aExpandXIncludes; expandEnvVarRefs = aExpandEnvVarRefs; } Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/XMLSerializer.java URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/XMLSerializer.java?rev=1234089&r1=1234088&r2=1234089&view=diff ============================================================================== --- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/XMLSerializer.java (original) +++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/XMLSerializer.java Fri Jan 20 19:33:07 2012 @@ -21,6 +21,8 @@ package org.apache.uima.util; import java.io.OutputStream; import java.io.Writer; +import java.util.ArrayList; +import java.util.List; import
javax.xml.transform.OutputKeys; import javax.xml.transform.Result; @@ -41,6 +43,7 @@ import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; +import org.xml.sax.ext.LexicalHandler; /** * Utility class that generates XML output from SAX events or DOM nodes. @@ -55,7 +58,7 @@ public class XMLSerializer { private OutputStream mOutputStream; private Writer mWriter; - + public XMLSerializer() { this(true); } @@ -78,6 +81,10 @@ public class XMLSerializer { } } + public void setIndent(boolean yes) { + mTransformer.setOutputProperty(OutputKeys.INDENT, yes ? "yes" : "no"); + } + public XMLSerializer(OutputStream aOutputStream) { this(); setOutputStream(aOutputStream); @@ -156,10 +163,28 @@ public class XMLSerializer { } } - static class CharacterValidatingContentHandler implements ContentHandler { + public static class CharacterValidatingContentHandler implements ContentHandler, LexicalHandler { ContentHandler mHandler; boolean mXml11; + private List<Node> mLastOutputNode = new ArrayList<Node>(); // the last output node for repeated subelement nodes + + public void lastOutputNodeAddLevel() { + mLastOutputNode.add(null); + } + + public void setLastOutputNode(Node n) { + mLastOutputNode.set(mLastOutputNode.size() -1, n); + } + + public Node getLastOutputNode() { + return mLastOutputNode.get(mLastOutputNode.size() -1); + } + + public void lastOutputNodeClearLevel() { + mLastOutputNode.remove(mLastOutputNode.size() -1); + } + CharacterValidatingContentHandler(boolean xml11, ContentHandler serializerHandler) { mHandler = serializerHandler; mXml11 = xml11; @@ -264,6 +289,17 @@ public class XMLSerializer { " character: " + ch[index] + ", 0x" + Integer.toHexString(ch[index]), null); } - } + } + + public void comment(char[] ch, int start, int length) throws SAXException { + ((LexicalHandler)mHandler).comment(ch, start, length); + } + + public void endCDATA() throws SAXException {} + public void endDTD() 
throws SAXException {} + public void endEntity(String arg0) throws SAXException {} + public void startCDATA() throws SAXException {} + public void startDTD(String arg0, String arg1, String arg2) throws SAXException {} + public void startEntity(String arg0) throws SAXException {} } } Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/SaxDeserializer_impl.java URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/SaxDeserializer_impl.java?rev=1234089&r1=1234088&r2=1234089&view=diff ============================================================================== --- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/SaxDeserializer_impl.java (original) +++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/SaxDeserializer_impl.java Fri Jan 20 19:33:07 2012 @@ -112,7 +112,17 @@ public class SaxDeserializer_impl implem public XMLizable getObject() throws InvalidXMLException { // COMMENT NODEs may be present, and getDocumentElement would skip it... 
Node rootDomNode = ((Document) mDOMResult.getNode()).getDocumentElement(); - +// NodeList children = mDOMResult.getNode().getChildNodes(); +// for (int i = 0; i < children.getLength(); i ++) { +// System.out.format("Child: %s", children.item(i).getNodeName()); +// if (children.item(i) instanceof Text) { +// String s = children.item(i).getTextContent(); +// for (int j = 0; j < s.length(); j++) { +// System.out.format(" %d", s.codePointAt(j)); +// } +// } +// System.out.print("\n"); +// } // build the object XMLizable result = mUimaXmlParser.buildObject((Element) rootDomNode, mOptions); @@ -122,12 +132,12 @@ public class SaxDeserializer_impl implem return result; } - + /** * @see org.xml.sax.ContentHandler#characters(char[], int, int) */ public void characters(char[] ch, int start, int length) throws SAXException { - // System.out.println("SaxDeserializer_impl::characters"); +// System.out.format("SaxDeserializer_impl::characters: %s%n", new String(ch, start, length)); mTransformerHandler.characters(ch, start, length); } @@ -160,7 +170,10 @@ public class SaxDeserializer_impl implem * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int) */ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { - mTransformerHandler.ignorableWhitespace(ch, start, length); +// System.out.format("SaxDeserializer_impl::ignorableWS: %s%n", new String(ch, start, length)); + if (mOptions.preserveComments) { + mTransformerHandler.ignorableWhitespace(ch, start, length); + } } /** Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/XMLParser_impl.java URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/XMLParser_impl.java?rev=1234089&r1=1234088&r2=1234089&view=diff ============================================================================== --- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/XMLParser_impl.java (original) +++ 
uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/XMLParser_impl.java Fri Jan 20 19:33:07 2012 @@ -137,7 +137,7 @@ public class XMLParser_impl implements X // Turn on namespace support factory.setNamespaceAware(true); - SAXParser parser = factory.newSAXParser(); // in the future, if performance issue, can save this , and reuse with reset() + SAXParser parser = factory.newSAXParser(); // unless multi-threaded, in the future, if performance issue, can save this , and reuse with reset() XMLReader reader = parser.getXMLReader(); reader.setFeature("http://xml.org/sax/features/namespace-prefixes", true); @@ -173,7 +173,9 @@ public class XMLParser_impl implements X // Parse with SaxDeserializer SaxDeserializer deser = new SaxDeserializer_impl(this, aOptions); reader.setContentHandler(deser); - reader.setProperty ("http://xml.org/sax/properties/lexical-handler", deser); + if (aOptions.preserveComments) { + reader.setProperty ("http://xml.org/sax/properties/lexical-handler", deser); + } reader.parse(input); // if there was an exception, throw it @@ -287,12 +289,21 @@ public class XMLParser_impl implements X UIMA_IllegalStateException.COULD_NOT_INSTANTIATE_XMLIZABLE, new Object[] { cls .getName() }, e); } - - // construct the XMLizable object from the XML element - object.buildFromXMLElement(aElement, this, aOptions); + + callBuildFromXMLElement(aElement, object, aOptions); return object; } + + private void callBuildFromXMLElement(Element aElement, XMLizable object, ParsingOptions aOptions) + throws InvalidXMLException { + if (aOptions.preserveComments && (object instanceof MetaDataObject_impl)) { + ((MetaDataObject_impl)object).setInfoset(aElement); + } + + object.buildFromXMLElement(aElement, this, aOptions); + + } /* * (non-Javadoc) @@ -326,8 +337,7 @@ public class XMLParser_impl implements X } // construct the XMLizable object from the XML element - object.buildFromXMLElement(aElement, this, aOptions); - + callBuildFromXMLElement(aElement, object, 
aOptions); return object; }