[jira] Created: (PDFBOX-936) No HTML Header using PDFText2HTML

Clement Igonet (JIRA) Thu, 06 Jan 2011 02:34:16 -0800

No HTML Header using PDFText2HTML
---------------------------------

                 Key: PDFBOX-936
                 URL: https://issues.apache.org/jira/browse/PDFBOX-936
             Project: PDFBox
          Issue Type: Bug
    Affects Versions: 1.3.1
         Environment: Ubuntu 10.10 / Netbeans / Java version "1.6.0_22"
            Reporter: Clement Igonet
             Fix For: 1.5.0, 1.4.0



The following code should output an HTML string that starts with this header:
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html><head><title></title>

... but it does not!

Here is the test code:

package fr.def.iss.vd2.mod_instruction_gui.view;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.util.PDFText2HTML;

/**
 * Reproduction program for PDFBOX-936: builds a one-page PDF in memory,
 * converts it to HTML with {@link PDFText2HTML}, and prints the result so
 * the presence or absence of the HTML header can be inspected.
 */
public class Test {

    /**
     * Runs the round trip (text -> PDF bytes -> HTML string) and prints the
     * HTML to standard output, prefixed with {@code "html:"}.
     *
     * @param args ignored
     */
    public static void main(final String[] args) {
        byte[] buf = rawText2Pdf("Hell world");
        String html = pdf2Html(buf);
        System.out.println("html:" + html);
    }

    /**
     * Renders {@code text} as a single line of Helvetica Bold, size 12, at
     * position (100, 700) on a new one-page PDF document.
     *
     * <p>Failures are printed to standard error and not rethrown; in that
     * case the returned array holds whatever had been written to the
     * in-memory buffer before the failure.
     *
     * @param text the text to draw on the page
     * @return the bytes written to the in-memory buffer; never {@code null}
     */
    public static byte[] rawText2Pdf(String text) {
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        PDDocument document = null;
        try {
            document = new PDDocument();
            PDPage page = new PDPage();
            document.addPage(page);
            PDFont font = PDType1Font.HELVETICA_BOLD;
            PDPageContentStream contentStream =
                    new PDPageContentStream(document, page);
            try {
                contentStream.beginText();
                contentStream.setFont(font, 12);
                contentStream.moveTextPositionByAmount(100, 700);
                contentStream.drawString(text);
                contentStream.endText();
            } finally {
                // Close the content stream even when drawing fails.
                contentStream.close();
            }
            document.save(os);
        } catch (COSVisitorException ex) {
            ex.printStackTrace();
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            // Release the document on both the success and the failure path.
            closeDocument(document);
        }
        return os.toByteArray();
    }

    /**
     * Loads {@code pdf} as a PDF document and extracts its text as HTML
     * using a {@link PDFText2HTML} stripper created with encoding
     * {@code "utf-8"}.
     *
     * <p>The stripper is invoked twice on the same document: first through
     * {@code writeText} into a buffer whose contents are discarded, then
     * through {@code getText}, whose result is returned.
     *
     * @param pdf the PDF document bytes
     * @return the string returned by {@code getText}, or {@code null} if an
     *         {@link IOException} occurred (it is printed, not rethrown)
     */
    public static String pdf2Html(byte[] pdf) {
        String result = null;
        PDDocument document = null;
        try {
            PDFText2HTML stripper = new PDFText2HTML("utf-8");
            document = PDDocument.load(new ByteArrayInputStream(pdf));
            Writer writer = new OutputStreamWriter(
                    new ByteArrayOutputStream(), "utf-8");
            try {
                // NOTE(review): this first pass precedes getText in the
                // original report; the call order is kept in case it matters
                // to reproducing the missing header -- confirm.
                stripper.writeText(document, writer);
            } finally {
                writer.close();
            }
            result = stripper.getText(document);
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            // The loaded document was previously never closed.
            closeDocument(document);
        }
        return result;
    }

    /**
     * Closes {@code document} if it is non-null, printing (not rethrowing)
     * any {@link IOException} raised by the close.
     */
    private static void closeDocument(PDDocument document) {
        if (document != null) {
            try {
                document.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
}


-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.

[jira] Created: (PDFBOX-936) No HTML Header using PDFText2HTML

Reply via email to