Author: lehmi Date: Fri Jan 31 18:48:17 2014 New Revision: 1563215 URL: http://svn.apache.org/r1563215 Log: PDFBOX-1860: don't escape formatting close tags as proposed by Cheng Leong
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=1563215&r1=1563214&r2=1563215&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Fri Jan 31 18:48:17 2014 @@ -215,7 +215,7 @@ public class PDFText2HTML extends PDFTex @Override protected void writeParagraphEnd() throws IOException { - writeString(fontState.clear()); + super.writeString(fontState.clear()); // do not escape HTML super.writeParagraphEnd(); } Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java?rev=1563215&r1=1563214&r2=1563215&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java (original) +++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java Fri Jan 31 18:48:17 2014 @@ -30,16 +30,16 @@ import junit.framework.TestCase; public class TestPDFText2HTML extends TestCase { - private PDDocument createDocument() throws IOException { + private PDDocument createDocument(String title, PDFont font, String text) throws IOException { PDDocument doc = new PDDocument(); + doc.getDocumentInformation().setTitle(title); PDPage page = new PDPage(); doc.addPage(page); - PDFont font = PDType1Font.HELVETICA; PDPageContentStream contentStream = new PDPageContentStream(doc, page); contentStream.beginText(); 
contentStream.setFont(font, 12); contentStream.moveTextPositionByAmount(100, 700); - contentStream.drawString("<foo>"); + contentStream.drawString(text); contentStream.endText(); contentStream.close(); return doc; @@ -47,15 +47,23 @@ public class TestPDFText2HTML extends Te public void testEscapeTitle() throws IOException { PDFTextStripper stripper = new PDFText2HTML("UTF-8"); - PDDocument doc = createDocument(); - doc.getDocumentInformation().setTitle("<script>\u3042"); + PDDocument doc = createDocument("<script>\u3042", PDType1Font.HELVETICA, "<foo>"); String text = stripper.getText(doc); Matcher m = Pattern.compile("<title>(.*?)</title>").matcher(text); assertTrue(m.find()); assertEquals("&lt;script&gt;&#12354;", m.group(1)); - + assertTrue(text.indexOf("&lt;foo&gt;") >= 0); - + } + + public void testStyle() throws IOException { + PDFTextStripper stripper = new PDFText2HTML("UTF-8"); + PDDocument doc = createDocument("t", PDType1Font.HELVETICA_BOLD, "<bold>"); + String text = stripper.getText(doc); + + Matcher bodyMatcher = Pattern.compile("<p>(.*?)</p>").matcher(text); + assertTrue("body p exists", bodyMatcher.find()); + assertEquals("body p", "<b>&lt;bold&gt;</b>", bodyMatcher.group(1)); } }