No HTML Header using PDFText2HTML
---------------------------------
Key: PDFBOX-936
URL: https://issues.apache.org/jira/browse/PDFBOX-936
Project: PDFBox
Issue Type: Bug
Affects Versions: 1.3.1
Environment: Ubuntu 10.10 / Netbeans / Java version "1.6.0_22"
Reporter: Clement Igonet
Fix For: 1.5.0, 1.4.0
The following code should output an HTML string that starts with this header:
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html><head><title></title>
... but it does not!
Here is the test code:
package fr.def.iss.vd2.mod_instruction_gui.view;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.util.PDFText2HTML;
/**
 * Stand-alone reproduction for PDFBOX-936: builds a one-page PDF in memory,
 * converts it to HTML with {@link PDFText2HTML} and prints the result so the
 * presence (or absence) of the HTML header can be checked by eye.
 */
public class Test {

    /**
     * Generates a small PDF, converts it to HTML and prints the HTML to
     * standard output, prefixed with {@code "html:"}.
     *
     * @param args ignored
     */
    public static void main(final String[] args) {
        byte[] buf = rawText2Pdf("Hell world");
        String html = pdf2Html(buf);
        System.out.println("html:" + html);
    }

    /**
     * Creates a single-page PDF containing {@code text} drawn in 12pt
     * Helvetica Bold at position (100, 700) and returns it as a byte array.
     *
     * <p>Failures are printed to standard error rather than propagated; in
     * that case whatever bytes were written before the failure (possibly
     * none) are returned.
     *
     * @param text the text to draw on the page
     * @return the bytes written to the in-memory stream, never {@code null}
     */
    public static byte[] rawText2Pdf(String text) {
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        try {
            PDDocument document = new PDDocument();
            try {
                PDPage page = new PDPage();
                document.addPage(page);
                PDFont font = PDType1Font.HELVETICA_BOLD;
                PDPageContentStream contentStream =
                        new PDPageContentStream(document, page);
                try {
                    contentStream.beginText();
                    contentStream.setFont(font, 12);
                    contentStream.moveTextPositionByAmount(100, 700);
                    contentStream.drawString(text);
                    contentStream.endText();
                } finally {
                    // Close even when drawing fails so the stream is not leaked.
                    contentStream.close();
                }
                document.save(os);
            } finally {
                // Release the document on every path, not only on success.
                document.close();
            }
        } catch (COSVisitorException ex) {
            ex.printStackTrace();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        return os.toByteArray();
    }

    /**
     * Converts an in-memory PDF to HTML using {@link PDFText2HTML}.
     *
     * <p>The stripper is first run through {@code writeText} into a throwaway
     * UTF-8 writer and then queried with {@code getText}; only the
     * {@code getText} result is returned.
     *
     * <p>NOTE(review): both calls on the same stripper instance come from the
     * original report and are kept in that order because the reported missing
     * header may depend on that sequence; confirm before simplifying.
     *
     * @param pdf the PDF file content
     * @return the HTML produced by {@code getText}, or {@code null} if an
     *         {@link IOException} occurred before it was produced
     */
    public static String pdf2Html(byte[] pdf) {
        String result = null;
        try {
            PDFText2HTML stripper = new PDFText2HTML("utf-8");
            PDDocument document =
                    PDDocument.load(new ByteArrayInputStream(pdf));
            try {
                // First pass: output goes to a buffer that is never read.
                ByteArrayOutputStream os = new ByteArrayOutputStream();
                Writer writer = new OutputStreamWriter(os, "utf-8");
                try {
                    stripper.writeText(document, writer);
                } finally {
                    // Closing the writer also closes the wrapped byte stream.
                    writer.close();
                }
                // Second pass: this is the string the caller receives.
                result = stripper.getText(document);
            } finally {
                // The loaded document was previously never closed.
                document.close();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        return result;
    }
}
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.