Author: tilman Date: Fri Oct 19 17:59:13 2018 New Revision: 1844362 URL: http://svn.apache.org/viewvc?rev=1844362&view=rev Log: PDFBOX-3646, PDFBOX-4345: fix problems with missing text and improper handling of special characters, by Kai Keggenhoff: - Instead of traversing the children of an element with the XPath "*" expression, simply iterate the children obtained from Node.getChildNodes(), process Text and CDATASection nodes directly and call richContentsToString for any elements - escape "<" and "&" in the text values read from the node values - added quoting " as &quot; to the attribute values to avoid possible corruption
Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/fdf/FDFAnnotation.java Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/fdf/FDFAnnotation.java URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/fdf/FDFAnnotation.java?rev=1844362&r1=1844361&r2=1844362&view=diff ============================================================================== --- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/fdf/FDFAnnotation.java (original) +++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/fdf/FDFAnnotation.java Fri Oct 19 17:59:13 2018 @@ -37,10 +37,12 @@ import org.apache.pdfbox.pdmodel.common. import org.apache.pdfbox.pdmodel.interactive.annotation.PDBorderEffectDictionary; import org.apache.pdfbox.pdmodel.interactive.annotation.PDBorderStyleDictionary; import org.apache.pdfbox.util.DateConverter; +import org.w3c.dom.CDATASection; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; +import org.w3c.dom.Text; /** * This represents an FDF annotation that is part of the FDF document. 
@@ -959,43 +961,49 @@ public abstract class FDFAnnotation impl private String richContentsToString(Node node, boolean root) { - String retval = ""; - XPath xpath = XPathFactory.newInstance().newXPath(); - try + String subString = ""; + + NodeList nodelist = node.getChildNodes(); + for (int i = 0; i < nodelist.getLength(); i++) { - NodeList nodelist = (NodeList) xpath.evaluate("*", node, XPathConstants.NODESET); - String subString = ""; - if (nodelist.getLength() == 0) - { - subString = node.getFirstChild().getNodeValue(); - } - for (int i = 0; i < nodelist.getLength(); i++) + Node child = nodelist.item(i); + if (child instanceof Element) { - Node child = nodelist.item(i); - if (child instanceof Element) - { - subString += richContentsToString(child, false); - } + subString += richContentsToString(child, false); } - NamedNodeMap attributes = node.getAttributes(); - StringBuilder builder = new StringBuilder(); - for (int i = 0; i < attributes.getLength(); i++) + else if (child instanceof CDATASection) { - Node attribute = attributes.item(i); - builder.append(String.format(" %s=\"%s\"", attribute.getNodeName(), - attribute.getNodeValue())); + subString += "<![CDATA[" + ((CDATASection) child).getData() + "]]>"; } - if (root) + else if (child instanceof Text) { - return subString; + String cdata = ((Text) child).getData(); + if (cdata!=null) + { + cdata = cdata.replace("&", "&amp;").replace("<", "&lt;"); + } + subString += cdata; } - retval = String.format("<%s%s>%s</%s>", node.getNodeName(), builder.toString(), - subString, node.getNodeName()); } - catch (XPathExpressionException e) + if (root) { - LOG.debug("Error while evaluating XPath expression for richtext contents"); + return subString; } - return retval; + + NamedNodeMap attributes = node.getAttributes(); + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < attributes.getLength(); i++) + { + Node attribute = attributes.item(i); + String attributeNodeValue = attribute.getNodeValue(); + if
(attributeNodeValue!=null) + { + attributeNodeValue = attributeNodeValue.replace("\"", "&quot;"); + } + builder.append(String.format(" %s=\"%s\"", attribute.getNodeName(), + attributeNodeValue)); + } + return String.format("<%s%s>%s</%s>", node.getNodeName(), builder.toString(), + subString, node.getNodeName()); } -} +} \ No newline at end of file