Hi,
I attach the final Java class for hocr2pdf, implemented with Jericho and iText.
Ciao!
--
You received this message because you are subscribed to the Google Groups
"ocropus" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
To view this discussion on the web visit
https://groups.google.com/d/msgid/ocropus/3717f9db-63d9-471d-8bc3-12b2939f38b2%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
package test;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Font;
import com.itextpdf.text.FontFactory;
import com.itextpdf.text.Image;
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.CMYKColor;
import com.itextpdf.text.pdf.PdfContentByte;
import com.itextpdf.text.pdf.PdfWriter;
/**
* @author Federico Tarantino
*/
public class OcrService {

    /** The resolution of a PDF file (using iText) is 72pt per inch. */
    private static final float POINTS_PER_INCH = 72.0f;

    /**
     * Matches the hOCR {@code bbox x0 y0 x1 y1} property inside a {@code title}
     * attribute and captures the four integer coordinates as groups 1-4.
     */
    private static final Pattern BBOX_PATTERN =
            Pattern.compile("bbox\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");

    /**
     * Creates a PDF from an hOCR HTML file and its source image, using iText and Jericho.
     * The image is drawn on the page and, for every {@code ocr_line} element that carries
     * a {@code bbox} in its {@code title} attribute, the line text is written in invisible
     * rendering mode at the position of that bounding box.
     *
     * <p>{@code ocr_line} elements without a {@code title} attribute, without a
     * {@code bbox} property, or with an empty/inverted bounding box are skipped.
     *
     * <p>{@link DocumentException} and {@link IOException} are not propagated; their stack
     * trace is printed and the method returns.
     *
     * @param hocrFile   hOCR HTML file (generated with tesseract or other OCR software)
     * @param inputFile  source image file
     * @param outputFile output stream the PDF is written to
     */
    public static void hocr2pdf(File hocrFile, File inputFile, OutputStream outputFile){
        try {
            // Using the jericho library to parse the HTML file
            Source source = new Source(hocrFile);
            // Load the image
            Image image = Image.getInstance(inputFile.getAbsolutePath());

            // Image pixels per PDF point, per axis. Without X DPI information one pixel
            // is mapped to one point. If only the Y DPI is missing, reuse the X value so
            // the Y scale can never be zero (which would turn every division into Infinity).
            float dotsPerPointX;
            float dotsPerPointY;
            if (image.getDpiX() > 0) {
                dotsPerPointX = image.getDpiX() / POINTS_PER_INCH;
                dotsPerPointY = image.getDpiY() > 0
                        ? image.getDpiY() / POINTS_PER_INCH
                        : dotsPerPointX;
            } else {
                dotsPerPointX = 1.0f;
                dotsPerPointY = 1.0f;
            }

            float pageImagePixelHeight = image.getHeight();
            float pageWidthPt = image.getWidth() / dotsPerPointX;
            float pageHeightPt = image.getHeight() / dotsPerPointY;

            Document pdfDocument = new Document(new Rectangle(pageWidthPt, pageHeightPt));
            PdfWriter pdfWriter = PdfWriter.getInstance(pdfDocument, outputFile);
            pdfDocument.open();

            // Standard font used for the text layer
            Font defaultFont = FontFactory.getFont(FontFactory.TIMES, 8, Font.NORMAL, CMYKColor.BLACK);

            // Put the text behind the picture
            // (use pdfWriter.getDirectContent() instead when debugging)
            PdfContentByte cb = pdfWriter.getDirectContentUnder();

            // Put the image in front of the text
            image.scaleToFit(pageWidthPt, pageHeightPt);
            image.setAbsolutePosition(0, 0);
            pdfWriter.getDirectContent().addImage(image);

            // Only tags of the ocr_line class are interesting. The advance to the next
            // tag lives in the loop header so that 'continue' cannot skip it.
            for (StartTag ocrLineTag = source.getNextStartTag(0, "class", "ocr_line", false);
                    ocrLineTag != null;
                    ocrLineTag = source.getNextStartTag(ocrLineTag.getEnd(), "class", "ocr_line", false)) {

                Element lineElement = ocrLineTag.getElement();
                String title = lineElement.getAttributeValue("title");
                if (title == null) {
                    // No title attribute, hence no bbox to position the text with
                    continue;
                }
                Matcher bboxMatcher = BBOX_PATTERN.matcher(title);
                if (!bboxMatcher.find()) {
                    continue;
                }

                // hOCR bbox order: left, top, right, bottom (image pixels, origin top-left)
                int left = Integer.parseInt(bboxMatcher.group(1));
                int top = Integer.parseInt(bboxMatcher.group(2));
                int right = Integer.parseInt(bboxMatcher.group(3));
                int bottom = Integer.parseInt(bboxMatcher.group(4));

                float bboxWidthPt = (right - left) / dotsPerPointX;
                float bboxHeightPt = (bottom - top) / dotsPerPointY;
                if (bboxWidthPt <= 0f || bboxHeightPt <= 0f) {
                    // Empty or inverted box: no usable font size exists for it
                    continue;
                }

                String line = lineElement.getContent().getTextExtractor().toString();

                // Width of the line at font size 1; the width at any other size is this
                // value multiplied by the size.
                float unitWidthPt = defaultFont.getBaseFont().getWidthPoint(line, 1f);
                float fontSize = fitFontSize(unitWidthPt, bboxHeightPt, bboxWidthPt);

                // Put the text into the PDF
                cb.beginText();
                // Comment the next line to debug the PDF output (visible text)
                cb.setTextRenderingMode(PdfContentByte.TEXT_RENDER_MODE_INVISIBLE);
                cb.setFontAndSize(defaultFont.getBaseFont(), fontSize);
                // PDF origin is bottom-left, so the Y coordinate is flipped
                cb.moveText(left / dotsPerPointX, (pageImagePixelHeight - bottom) / dotsPerPointY);
                cb.showText(line);
                cb.endText();
            }

            pdfDocument.close();
            pdfWriter.close();
        } catch (DocumentException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the font size for a line: the bbox height, reduced if necessary so that
     * the text is no wider than the bbox.
     *
     * @param unitWidthPt width of the text at font size 1 (0 for empty text)
     * @param maxHeightPt bbox height in points, used as the preferred font size
     * @param maxWidthPt  bbox width in points
     * @return {@code maxHeightPt} if the text fits at that size (or has no width),
     *         otherwise the size at which the text width equals {@code maxWidthPt}
     */
    private static float fitFontSize(float unitWidthPt, float maxHeightPt, float maxWidthPt) {
        if (unitWidthPt <= 0f || unitWidthPt * maxHeightPt < maxWidthPt) {
            return maxHeightPt;
        }
        return maxWidthPt / unitWidthPt;
    }
}