On Tue, Feb 3, 2015 at 4:29 AM, Lorena Leishman < [email protected]> wrote:
> Hi, > I have a PDF that has information displayed in tables. Example: > Company Name: Barnes & Noble Bank Of America Macy's > Account #: 123xxxxx 345xxxx 679xxxx > Status: Open Closed Open > Balance: $23. $0.00 $100 > Is there a way with PDFBox to extract specific value(s) from the table? > Example: Bank Of America and $0.00 > Also, is there a way to cut the whole table and paste it into a > different PDF? > Please let me know, Thanks! > Lorena
package input.pdf;

import com.vividsolutions.jts.geom.Coordinate;
import com.vividsolutions.jts.geom.LineString;
import java.awt.Dimension;
import java.awt.Toolkit;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.concurrent.CountDownLatch;
import javafx.application.Platform;
import javafx.scene.Scene;
import javafx.scene.canvas.Canvas;
import javafx.scene.canvas.GraphicsContext;
import javafx.scene.layout.StackPane;
import javafx.scene.paint.Color;
import javafx.stage.Stage;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.geotools.data.simple.SimpleFeatureCollection;
import org.geotools.data.simple.SimpleFeatureIterator;
import output.ShapeStyle;
import topography.LineFeature;

/**
 * A JavaFX window containing a single canvas, used to draw lines, rectangles and text that are given in page
 * coordinates. Page coordinates are multiplied by a scale factor chosen so that the page height fills the screen
 * height.
 *
 * Drawing methods may be called from any thread: they convert the coordinates on the calling thread and queue the
 * actual drawing on the FX application thread.
 *
 * @author Frank van der Hulst <[email protected]>
 */
public class Display1 {

    private final static Logger log = Logger.getLogger(Display1.class.getName());

    /** Paths with fewer points than this are drawn with a single stroke call. */
    private static final int SINGLE_CALL_LIMIT = 256;
    /**
     * Maximum number of points passed to one stroke call when a long path is drawn in pieces. Consecutive pieces
     * start {@code CHUNK_SIZE - 1} points apart, so each piece begins on the last point of the previous one.
     */
    private static final int CHUNK_SIZE = 250;

    private final Dimension screen = Toolkit.getDefaultToolkit().getScreenSize();
    private StackPane root;
    private GraphicsContext gc;
    private Stage stage = null;
    private Canvas canvas;
    private double width, height, scale;
    private final double xOffset, yOffset;

    /**
     * Creates and shows the window, and returns once its graphics context is available.
     *
     * May be called from the FX application thread (the window is then built directly) or from any other thread (the
     * window is built on the FX thread while the caller blocks until that has finished). If the calling thread is
     * interrupted while waiting, its interrupt flag is restored and the constructor returns immediately; the window
     * may then not be ready yet.
     *
     * @param title window title
     * @param pageWidth width of the page, in page units
     * @param pageHeight height of the page, in page units; the scale is screen height divided by this value
     * @param xOffset value subtracted from X coordinates by {@link #drawLineString}
     * @param yOffset value subtracted from Y coordinates by {@link #drawLineString}
     */
    public Display1(final String title, double pageWidth, double pageHeight, double xOffset, double yOffset) {
        this.xOffset = xOffset;
        this.yOffset = yOffset;

        if (Platform.isFxApplicationThread()) {
            // Waiting for a task queued on our own thread would never finish, so build the window directly.
            initStage(title, pageWidth, pageHeight);
            return;
        }

        final CountDownLatch ready = new CountDownLatch(1);
        Platform.runLater(() -> {
            try {
                initStage(title, pageWidth, pageHeight);
            } finally {
                // Always release the waiting thread, even if building the window failed.
                ready.countDown();
            }
        });

        final long started = System.nanoTime();
        try {
            // The latch also makes the fields written on the FX thread visible to this thread.
            ready.await();
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
            log.warn("Interrupted while waiting for graphics", ex);
        }
        log.trace("Waited " + ((System.nanoTime() - started) / 1000000L) + "ms for graphics");
    }

    /**
     * Builds the stage, scene and canvas. Must run on the FX application thread.
     */
    private void initStage(String title, double pageWidth, double pageHeight) {
        stage = new Stage();
        root = new StackPane();
        stage.setX(0);
        stage.setY(0);
        stage.setTitle(title);
        scale = (double) screen.height / pageHeight;
        width = pageWidth * scale;
        height = screen.height;
        Scene scene = new Scene(root, width, height);
        canvas = new Canvas(width, height);
        root.getChildren().add(canvas);
        stage.setWidth(width);
        stage.setHeight(height);
        stage.setScene(scene);
        stage.show();
        gc = canvas.getGraphicsContext2D();
    }

    /**
     * Closes the window. Drawing requests that were queued before this call still run, because each one holds its
     * own reference to the graphics context.
     */
    public void close() {
        final Stage s = stage;
        if (s != null) {
            // update display on FX thread
            Platform.runLater(s::close);
        }
        canvas = null;
        gc = null;
    }

    /**
     * Converts an AWT colour to the equivalent JavaFX colour.
     *
     * @param awt colour with components in the range 0-255
     * @return JavaFX colour with components in the range 0.0-1.0
     */
    public Color javaFX(java.awt.Color awt) {
        // Divide by 255.0: integer division would reduce every component to 0 or 1.
        return new javafx.scene.paint.Color(awt.getRed() / 255.0, awt.getGreen() / 255.0, awt.getBlue() / 255.0,
                awt.getAlpha() / 255.0);
    }

    /**
     * Draws a closed outline through the given points.
     *
     * @param L points in page coordinates; nothing is drawn if null or empty
     * @param c stroke colour
     * @param lw line width
     */
    public void drawPolyLine(final ArrayList<java.awt.Point.Float> L, final Color c, final float lw) {
        assert gc != null : "Null gc";
        final GraphicsContext g = gc;
        if (g == null || L == null || L.isEmpty()) {
            return;
        }
        final int numPoints = L.size();
        final double[] x = new double[numPoints];
        final double[] y = new double[numPoints];
        int i = 0;
        for (java.awt.Point.Float point : L) {
            x[i] = point.x * scale;
            y[i++] = point.y * scale;
        }
        Platform.runLater(() -> {
            g.setStroke(c);
            g.setFill(null);
            g.setLineWidth(lw);
            if (numPoints < SINGLE_CALL_LIMIT) {
                g.strokePolygon(x, y, numPoints);
                return;
            }
            strokeInChunks(g, x, y);
            // Close the outline
            g.strokeLine(x[0], y[0], x[numPoints - 1], y[numPoints - 1]);
        });
    }

    /**
     * Draws a straight line between two points given in page coordinates.
     *
     * @param x1 start X
     * @param y1 start Y
     * @param x2 end X
     * @param y2 end Y
     * @param c stroke colour
     * @param lw line width
     */
    public void drawSegment(float x1, float y1, float x2, float y2, final Color c, final float lw) {
        assert gc != null : "Null gc";
        final GraphicsContext g = gc;
        if (g == null) {
            return;
        }
        final double[] x = {x1 * scale, x2 * scale};
        final double[] y = {y1 * scale, y2 * scale};
        Platform.runLater(() -> {
            g.setStroke(c);
            g.setLineWidth(lw);
            g.strokePolygon(x, y, x.length);
        });
    }

    /**
     * Draws the outline of a rectangle given in page coordinates.
     *
     * @param area rectangle to outline; nothing is drawn if null
     * @param c stroke colour
     * @param lw line width
     */
    public void drawRectangle(Rectangle2D.Float area, final Color c, final float lw) {
        assert gc != null : "Null gc";
        final GraphicsContext g = gc;
        if (g == null || area == null) {
            return;
        }
        final double left = area.x * scale;
        final double right = (area.x + area.width) * scale;
        final double top = area.y * scale;
        final double bottom = (area.y + area.height) * scale;
        final double[] x = {left, right, right, left};
        final double[] y = {top, top, bottom, bottom};
        Platform.runLater(() -> {
            g.setStroke(c);
            g.setLineWidth(lw);
            g.strokePolygon(x, y, x.length);
        });
    }

    /**
     * Draws text at a position given in page coordinates.
     *
     * @param str text to draw; nothing is drawn if null or empty
     * @param c fill colour
     * @param x X position
     * @param y Y position
     */
    public void drawText(String str, Color c, float x, float y) {
        assert gc != null : "Null gc";
        final GraphicsContext g = gc;
        if (g == null || str == null || str.isEmpty()) {
            return;
        }
        final double sx = x * scale;
        final double sy = y * scale;
        Platform.runLater(() -> {
            g.setFill(c);
            g.fillText(str, sx, sy);
        });
    }

    /**
     * Hides or shows the window. The request is queued on the FX application thread.
     *
     * @param min true to hide the window, false to show it
     */
    public void minimize(boolean min) {
        final Stage s = stage;
        if (s == null) {
            return;
        }
        final Runnable action = min ? s::hide : s::show;
        Platform.runLater(action);
    }

    /**
     * Draws the default geometry of every feature in the collection as a line string.
     *
     * @param src features to draw
     * @param style passed through to {@link #drawLineString}
     */
    public void drawFeatureCollection(SimpleFeatureCollection src, ShapeStyle style) {
        try (SimpleFeatureIterator it = src.features()) {
            while (it.hasNext()) {
                LineFeature local = new LineFeature(it.next());
                drawLineString((LineString) local.getDefaultGeometry(), style);
            }
        }
    }

    /**
     * Draws an open line through the coordinates of a line string, after subtracting the X/Y offsets given to the
     * constructor. The line is always drawn in blue-violet with a width of 1.
     *
     * @param ls line string to draw
     * @param style currently not used by this method
     */
    public void drawLineString(LineString ls, ShapeStyle style) {
        final Coordinate[] coordinates = ls.getCoordinates();
        assert gc != null : "Null gc";
        final GraphicsContext g = gc;
        if (g == null || coordinates == null || coordinates.length == 0) {
            return;
        }
        final int numPoints = coordinates.length;
        final double[] x = new double[numPoints];
        final double[] y = new double[numPoints];
        int i = 0;
        for (Coordinate coordinate : coordinates) {
            x[i] = (coordinate.x - xOffset) * scale;
            y[i++] = (coordinate.y - yOffset) * scale;
        }
        Platform.runLater(() -> {
            g.setStroke(Color.BLUEVIOLET);
            g.setFill(Color.BLUEVIOLET);
            g.setLineWidth(1);
            if (numPoints < SINGLE_CALL_LIMIT) {
                g.strokePolyline(x, y, numPoints);
                return;
            }
            strokeInChunks(g, x, y);
        });
    }

    /**
     * Strokes a long open path as a series of shorter polylines of at most {@link #CHUNK_SIZE} points each. Must run
     * on the FX application thread.
     */
    private static void strokeInChunks(GraphicsContext g, double[] x, double[] y) {
        final int numPoints = x.length;
        for (int from = 0; from < numPoints; from += CHUNK_SIZE - 1) {
            final int count = Math.min(CHUNK_SIZE, numPoints - from);
            g.strokePolyline(Arrays.copyOfRange(x, from, from + count), Arrays.copyOfRange(y, from, from + count),
                    count);
        }
    }
}
package input.pdf;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.awt.Dimension;
import java.awt.Image;
import java.awt.Point;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.PathIterator;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NavigableSet;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfviewer.PageDrawer;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.TextPosition;

/**
 * Scan a PDF document page by page, and extract a table delimited by lines of text that match the specified
 * delimiters.
 *
 * @author <a href="mailto:[email protected]">Frank van der Hulst</a>
 * @version $Revision: 1.00 $
 */
public class LinedTableStripper extends PageDrawer {

    private final static Logger log = Logger.getLogger(LinedTableStripper.class.getName());

    /**
     * Amount subtracted from the X coordinate of graphics on pages rotated by 90 degrees.
     * NOTE(review): the origin of this value is not documented in the code -- confirm it is not specific to one
     * document layout.
     */
    private static final float ROTATED_X_SHIFT = 250;

    /**
     * Top and bottom Y coordinates of the range to be scanned for the table, to exclude page headers and footers. If
     * null, all of each page is scanned.
     */
    protected float[] pageMargins;
    /**
     * ArrayList of rows, one for each row in the result table. Each row consists of an ArrayList of Strings. Rows
     * may not all be the same size. Phantom rows may be generated which contain nothing but empty Strings.
     */
    private ArrayList<ArrayList<String>> table;
    private TreeSet<MyTextPosition> textItems = null;
    @SuppressWarnings("PackageVisibleField")
    TreeSet<TableCell> boxes = null;
    private TreeSet<TableCell> horizLines = null;
    private TreeSet<TableCell> vertLines = null;
    private float pageWidth, pageHeight, rotation;

    /**
     * Default constructor.
     *
     * @param pageMargins Y values on each page outside which text will be ignored, intended to exclude page headers
     * and footers. May be null, in which case all text is scanned.
     * @throws IOException If there is an error loading properties from the file.
     */
    public LinedTableStripper(float[] pageMargins) throws IOException {
        super();
        assert pageMargins == null || pageMargins[0] < pageMargins[1];
        this.pageMargins = pageMargins;
        table = new ArrayList<>(0);
    }

    /**
     * Main entry point: Scan one page in the document.
     *
     * First populates textItems, boxes, vertLines, horizLines via calls to processTextPosition(), fillPath(),
     * strokePath(), drawImage(), shFill() methods from processStream() via drawPage().
     *
     * The next phase is to find the table in the extracted text by searching for lines which match the specified
     * patterns.
     *
     * Finally, if the table has been found, the vertical lines in the table are used to separate the text into table
     * cells.
     *
     * Rows are appended to the same result table on every call, so the returned table accumulates across pages.
     *
     * @param p The page to scan.
     * @param delimiter0 String containing regex pattern to identify the beginning of the table. Required.
     * @param delimiter1 String containing regex pattern to identify the end of the table. Required.
     * @return Result table, or null if either delimiter was not found on this page
     *
     * @throws IOException If there is an IO error while drawing the page.
     */
    @SuppressWarnings("ReturnOfCollectionOrArrayField")
    public ArrayList<ArrayList<String>> parsePage(PDPage p, String delimiter0, String delimiter1) throws IOException {
        drawPage(p);

        // Scan textItems for the delimiter lines
        float[] tableBounds = findTable(delimiter0, delimiter1);
        if (tableBounds[0] == Float.MAX_VALUE || tableBounds[1] == Float.MAX_VALUE) {
            return null;
        }
        log.debug("Table found: " + tableBounds[0] + "-" + tableBounds[1]);
        extractTable(tableBounds);
        return table;
    }

    @Override
    public void processOperator(String operation, List<COSBase> arguments) throws IOException {
        // Only build the message when it will actually be logged
        if (log.isTraceEnabled()) {
            log.trace("processOperator1: " + operation + ", " + arguments.toString());
        }
        super.processOperator(operation, arguments);
    }

    @Override
    protected void processOperator(PDFOperator operator, List<COSBase> arguments) throws IOException {
        if (log.isTraceEnabled()) {
            log.trace("processOperator2: " + operator.toString() + ", " + operator.getOperation() + ", "
                    + arguments.toString());
        }
        // Ignore Resources object operator... this is used to draw embedded images, and not needed for text operations
        if (operator.getOperation().equals("Do")) {
            return;
        }
        super.processOperator(operator, arguments);
    }

    /**
     * Scan one page in the document.
     *
     * First populates textItems, boxes, vertLines, horizLines via calls to processTextPosition(), fillPath(),
     * strokePath(), drawImage(), shFill() methods from processStream().
     *
     * In the PDF document itself, there is no requirement for the objects to be in any order... i.e. they do not need
     * to be sorted top to bottom. So textItems, boxes, horizLines, and vertLines are SORTED collections, to provide
     * the data needed by findTable() and extractTable() in the right order.
     *
     * Named drawPage to indicate its functional similarity to the drawPage method in PageDrawer.
     *
     * @param p The page to scan.
     *
     * @throws IOException If there is an IO error while drawing the page.
     */
    @SuppressWarnings("SuspiciousNameCombination")
    public void drawPage(PDPage p) throws IOException {
        // Always start from empty collections, so that a page without content cannot be mistaken for the page that
        // was scanned before it.
        textItems = new TreeSet<>();
        boxes = new TreeSet<>();
        horizLines = new TreeSet<>();
        vertLines = new TreeSet<>();

        // Only process the page if there is some content
        // Otherwise we are done here and return -- the result table is still empty
        if (p.getContents() == null) {
            return;
        }
        log.debug("drawPage()");
        page = p;
        // findRotation() resolves an inherited rotation; getRotation() may return null when the page itself has none
        rotation = p.findRotation();
        PDRectangle cropBox = p.findCropBox();
        if (rotation == 90) {
            pageWidth = cropBox.getHeight();
            pageHeight = cropBox.getWidth();
        } else {
            pageWidth = cropBox.getWidth();
            pageHeight = cropBox.getHeight();
        }
        pageSize = new Dimension((int) pageWidth, (int) pageHeight); // Unused directly in this class, but required by PageDrawer
        log.debug("Page size: " + pageSize.toString() + ", rotation: " + rotation);

        // Extract entire page contents to textItems, boxes, horizLines, vertLines
        // These are sets, sorted by Y position and X position
        processStream(p, p.findResources(), p.getContents().getStream());
        log.debug("Scan finished: Boxes: " + boxes.size() + ", Vert " + vertLines.size() + ", Horiz "
                + horizLines.size() + ", Text " + textItems.size());
    }

    /**
     * Find a table matching the criteria on the specified page.
     *
     * Run once for each page in the document. This expects that drawPage() has run and populated the textItems sorted
     * collections.
     *
     * May be overridden if there is some other mechanism to identify the top and bottom of the table.
     *
     * @param delimiter0 regex that must match a whole line of text to mark the top of the table
     * @param delimiter1 regex that must match a whole line of text to mark the bottom of the table
     * @return a table of 2 entries, being the Y coordinates of the top and bottom of the table. If either is not
     * found, that entry in the table is Float.MAX_VALUE
     */
    protected float[] findTable(String delimiter0, String delimiter1) {
        log.debug("findTable()");
        assert !delimiter0.isEmpty() && !delimiter1.isEmpty();
        final float[] tableBounds = {Float.MAX_VALUE, Float.MAX_VALUE};
        if (textItems == null) {
            return tableBounds;
        }
        // Compile once rather than once per line of text
        final Pattern top = Pattern.compile(delimiter0);
        final Pattern bottom = Pattern.compile(delimiter1);

        // Scan through the text for the specified delimiter text
        float currentY = -1000;
        final StringBuilder line = new StringBuilder();
        for (MyTextPosition item : textItems) {
            final TextPosition pos = item.tp;
            if (Math.abs(pos.getY() - currentY) > pos.getHeight() / 4) {
                // Significant change in Y: the previous line is complete
                if (line.length() > 0) {
                    if (log.getLevel() == Level.TRACE) {
                        System.out.println(line);
                    }
                    recordDelimiters(line, currentY, top, bottom, tableBounds);
                }
                line.setLength(0);
                line.append(pos.getCharacter());
                currentY = pos.getY();
            } else {
                line.append(pos.getCharacter());
            }
        }
        if (line.length() > 0) {
            recordDelimiters(line, currentY, top, bottom, tableBounds);
        }
        return tableBounds;
    }

    /**
     * Updates the table bounds if the line matches a delimiter and lies above the position found so far.
     */
    private static void recordDelimiters(CharSequence line, float y, Pattern top, Pattern bottom, float[] tableBounds) {
        if (y < tableBounds[0] && top.matcher(line).matches()) {
            tableBounds[0] = y;
            log.debug("Delimiter 0 found at " + y);
        }
        if (y < tableBounds[1] && bottom.matcher(line).matches()) {
            tableBounds[1] = y;
            log.debug("Delimiter 1 found at " + y);
        }
    }

    /**
     * Extract text divided at cell boundaries, and build the table.
     *
     * Creates a list of rectangle cell boundaries. The text corresponding to each cell is extracted from textItems
     * and inserted into the appropriate row and column of the result table.
     *
     * Where a cell contains multiple lines of text, they will be separated by newlines. Whitespace within each cell
     * is ignored, unless specified by an actual space character.
     *
     * The text for each cell is then inserted into a row, which is in the result table.
     *
     * This requires that the 'textItems' and the 'vertLines' sets both contain data. These have been populated
     * earlier by processStream().
     *
     * This particular method uses *only* vertical lines to determine the limits of a cell. I.e. it assumes that every
     * cell has a separate vertical line on either side of it. This is convenient because it allows easy
     * left-to-right, top-to-bottom processing of the text into rows and columns.
     *
     * Assumes that the table spans the entire width of the page. However, text to the left of the first vertical line
     * is ignored, as is text to the right of the last vertical line.
     *
     * This method may be overridden to handle tables whose cell boundaries are defined differently.
     *
     * @param tableBounds Y coordinates of the table boundaries.
     */
    protected void extractTable(float[] tableBounds) {
        assert textItems != null && vertLines != null;

        // Extract subset of the page's vertical lines that relates to the table
        TableCell tableStart = new TableCell(0, tableBounds[0]);
        TableCell tableEnd = new TableCell(0, tableBounds[1]);
        NavigableSet<TableCell> tableCells = vertLines.subSet(tableStart, true, tableEnd, true);
        log.debug("Vert lines in table = " + tableCells.size() + ", Text items = " + textItems.size());

        // Convert vertical lines to rectangles, one for each cell
        // Requires that the tableCells and textItems be ordered top to bottom and left to right
        Iterator<TableCell> it = tableCells.iterator();
        float prevCellEnd = pageWidth;
        float prevCellY = 0;
        ArrayList<String> row = null;
        while (it.hasNext()) {
            final TableCell line = it.next();
            // Work on local copies: the lines are keys of a sorted set, and changing their x or width in place
            // would invalidate the ordering of that set.
            final float cellWidth = line.x - prevCellEnd;
            prevCellEnd = line.x;
            if (cellWidth < 0) {
                // cellWidth < 0 means that the new vertical line is to the left of the previous one.
                // Because lines are sorted top-to-bottom, left-to-right, this means that it is the beginning of a new row
                // This vertical line is the lefthand end, so it doesn't represent a cell
                continue;
            }

            // This vertical line is the right-hand edge of a cell... calculate the corresponding left-hand edge
            // Height is the height of the vertical line, which should be the same height as the previous line
            final float cellLeft = line.x - cellWidth;
            final float cellRight = cellLeft + cellWidth;
            final float cellTop = line.y;
            log.trace("Cell " + cellLeft + ", " + cellTop + ", " + cellWidth + ", " + line.height);

            // Now extract the text relating to this cell from the sorted textItems set
            MyTextPosition start = new MyTextPosition(cellLeft, cellTop);
            MyTextPosition end = new MyTextPosition(cellRight, cellTop + line.height);
            SortedSet<MyTextPosition> cellText = textItems.subSet(start, end);
            log.debug("Cell text size: " + cellText.size());
            final String text = collectCellText(cellText, cellLeft, cellRight);

            // Finally, put the text into the appropriate cell in the output table
            log.debug("Cell " + cellTop + ", " + prevCellY + ", " + text.replace("\n", "\\n"));
            if (row == null || cellTop > prevCellY + 1) {
                // New row of cells found (the very first cell always starts a row)
                prevCellY = cellTop;
                row = new ArrayList<>(0);
                table.add(row);
            }
            row.add(text);
        }
    }

    /**
     * Concatenates the characters that lie horizontally inside a cell.
     *
     * The candidates are in the right order, left-to-right, top-to-bottom, but also include text for other cells on
     * the same line, because textItems is sorted on Y coordinates first. Those characters are skipped here.
     *
     * @param candidates text items within the vertical range of the cell
     * @param cellLeft X coordinate of the left edge of the cell (exclusive)
     * @param cellRight X coordinate of the right edge of the cell (exclusive)
     * @return the cell text, with wrapped lines separated by newlines
     */
    private String collectCellText(SortedSet<MyTextPosition> candidates, float cellLeft, float cellRight) {
        final boolean tracing = log.isTraceEnabled();
        final StringBuilder text = new StringBuilder();
        float currentY = -1000;
        float nextX = 0;
        for (MyTextPosition item : candidates) {
            final TextPosition pos = item.tp;
            final float x = pos.getX();
            if (!(x > cellLeft && x < cellRight)) {
                continue;
            }
            if (text.length() > 0) {
                if (pos.getY() - currentY > pos.getHeight() / 4) {
                    // Significant change in Y coordinate is interpreted as word-wrap
                    text.append('\n');
                    // Inserting spaces based on X coordinates is extremely uncertain, so it isn't done!
                    // Depending on kerning and other factors, characters may be closer than the width of a space,
                    // yet visually will appear as a space.
                } else if (x < nextX) {
                    // Bold characters are represented by printing the same character twice, slightly offset
                    continue;
                }
            }
            // Replace nonbreaking space character with a space, for convenience of pattern matching and other
            // text processing
            text.append(pos.getCharacter().replace('\u00a0', ' '));
            currentY = pos.getY();
            nextX = x + pos.getWidth() / 2;
            if (tracing) {
                log.trace("Cell text: '" + text.toString().replace("\n", "\\n") + "'");
            }
        }
        return text.toString();
    }

    /**
     * Overrides the text drawing method to instead collect text and add it to the sorted list
     *
     * @param text The text to process
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        // the 0,0-reference has to be moved from the lower left (PDF) to the upper left (AWT-graphics)
        // to match the coordinate system of graphic lines
        // text.getY() manages page rotations, text.getTextPos().getYPosition() does not
        float y = text.getY();
        if (pageMargins != null && (y > pageMargins[1] || y + text.getHeight() < pageMargins[0])) {
            return;
        }
        textItems.add(new MyTextPosition(text));
    }

    /**
     * Overrides the rectangle filling method to collect graphic lines into sorted sets.
     *
     * @param windingRule The winding rule this path will use.
     */
    @Override
    public void fillPath(int windingRule) {
        collectLinePath("fillPath");
    }

    /**
     * Overrides the line drawing method to collect graphic lines into sorted sets
     */
    @Override
    public void strokePath() {
        collectLinePath("strokePath");
    }

    /**
     * Splits the current line path into cells, provided its bounding box overlaps the page margins.
     *
     * @param operation name of the calling operation, used for trace logging only
     */
    private void collectLinePath(String operation) {
        GeneralPath path = getLinePath();
        Rectangle2D box = path.getBounds2D();
        if (pageMargins == null || (box.getMaxY() > pageMargins[0] && box.getMinY() < pageMargins[1])) {
            log.trace(operation + " " + box.getMinY() + ", " + box.getMaxY());
            split(path);
        }
    }

    /**
     * Overrides the area shading method to collect graphic lines into sorted sets. Called by SHFill operator.
     *
     * @param shadingName The name of the Shading Dictionary to use for this fill instruction.
     */
    @Override
    public void shFill(COSName shadingName) {
        //Fill with Shading.
        log.trace("Shading = " + shadingName.toString());
        GeneralPath path = new GeneralPath(getGraphicsState().getCurrentClippingPath());
        Rectangle2D box = path.getBounds2D();
        if (pageMargins != null && (box.getMaxY() < pageMargins[0] || box.getMinY() > pageMargins[1])) {
            return;
        }
        log.debug("shFill " + box.getMinY() + ", " + box.getMaxY());
        split(path);
    }

    /**
     * Add a rectangle to the appropriate sorted list (horizLines, vertLines, boxes). Rectangles less than 2 units
     * high are horizontal lines, those less than 2 units wide are vertical lines, everything else is a box.
     *
     * @param cell rectangle to classify and store
     */
    private void addCell(TableCell cell) {
        log.trace(cell.toString());
        if (pageMargins != null && (cell.y > pageMargins[1] || cell.y + cell.height < pageMargins[0])) {
            return;
        }
        if (cell.height < 2) {
            horizLines.add(cell);
        } else if (cell.width < 2) {
            vertLines.add(cell);
        } else {
            boxes.add(cell);
        }
        log.trace("cell added " + cell.toString());
    }

    /**
     * Split the path into several separate rectangles, identified by SEG_MOVETO operations.
     *
     * A single 'path' may include multiple objects. The beginning of a new object is identified by a "move to"
     * command. Each object is assumed to be a rectangle, so the path itself is discarded and just the boundary
     * rectangle is retained.
     *
     * @param path The source path to split
     */
    protected void split(GeneralPath path) {
        PathIterator pathIt = path.getPathIterator(null);
        if (pathIt.isDone()) {
            return;
        }
        log.trace("split " + path.getBounds2D());
        final float[] coords = new float[6];
        Point.Float openPoint = null;
        TableCell item = null;
        while (!pathIt.isDone()) {
            final int type = pathIt.currentSegment(coords);
            final Point.Float newPoint;
            switch (type) {
                case PathIterator.SEG_CLOSE:
                    // Closing a shape returns to the point where it was opened; coords is not filled in
                    newPoint = openPoint;
                    break;
                // Treat curved lines as straight lines to their end point... they shouldn't be here anyway.
                // The end point is the last coordinate pair of the segment; the earlier pairs are control points.
                case PathIterator.SEG_QUADTO:
                    newPoint = toPagePoint(coords[2], coords[3]);
                    break;
                case PathIterator.SEG_CUBICTO:
                    newPoint = toPagePoint(coords[4], coords[5]);
                    break;
                default:
                    // SEG_MOVETO and SEG_LINETO
                    newPoint = toPagePoint(coords[0], coords[1]);
                    break;
            }
            log.trace("" + type + " " + (type == PathIterator.SEG_CLOSE ? "" : newPoint));

            if (type == PathIterator.SEG_MOVETO) {
                if (item != null && !item.isEmpty()) {
                    addCell(item);
                    item = null;
                }
                openPoint = newPoint; // Save for next SEG_CLOSE
            }
            if (newPoint != null) {
                if (item == null) {
                    // Create a new rectangle to start the new shape
                    item = new TableCell(newPoint.x, newPoint.y);
                } else {
                    // Expand the boundary of this rectangle to include the new point
                    item.add(newPoint);
                }
            }
            pathIt.next();
        }
        if (item != null && !item.isEmpty()) {
            addCell(item);
        }
        log.trace("results = " + vertLines.size());
    }

    /**
     * Converts a path coordinate to a point, swapping the axes for pages rotated by 90 degrees.
     */
    private Point.Float toPagePoint(float x, float y) {
        if (rotation == 90) {
            return new Point.Float(pageWidth - y - ROTATED_X_SHIFT, x);
        }
        return new Point.Float(x, y);
    }

    /**
     * Overrides the image drawing method to ignore images.
     *
     * If images are wanted, this could be overridden to collect them into a sorted set.
     *
     * @param awtImage The image to draw.
     * @param at The transformation to use when drawing.
     */
    @Override
    public void drawImage(Image awtImage, AffineTransform at) {
    }

    /**
     * Wrapper to sort Rectangles according to their location on the page. They are sorted according to their top
     * edge, from top to bottom. Where two rectangles have the same top edge, they are sorted according to their left
     * edge. Where two rectangles have the same top-left point, they are sorted by height, then width (this is
     * irrelevant for the table stripper, and just for completeness).
     *
     * Instances are created 1 unit wide and 1 unit high.
     */
    private static class TableCell extends Rectangle2D.Float implements Comparable<TableCell> {

        private static final long serialVersionUID = 1L;

        TableCell(float x, float y) {
            super(x, y, 1, 1);
        }

        @Override
        public int compareTo(TableCell other) {
            if (this == other) {
                return 0;
            }
            // java.lang.Float is spelled out because the simple name Float means Rectangle2D.Float in this class
            if (y != other.y) {
                return java.lang.Float.compare(y, other.y);
            }
            if (x != other.x) {
                return java.lang.Float.compare(x, other.x);
            }
            if (height != other.height) {
                return java.lang.Float.compare(height, other.height);
            }
            return java.lang.Float.compare(width, other.width);
        }

        @Override
        public String toString() {
            return ("" + x + ", " + y + ", " + width + ", " + height);
        }
    }

    /**
     * Wrapper to sort TextPositions according to their location on the page. They are sorted according to their Y
     * coordinate, from top to bottom. Where two items have the same Y coordinate, they are sorted according to their
     * X coordinate. This doesn't work well for superscripts (sorted ahead of normal text on the same line) and
     * subscripts (sorted after normal text). Quantizing Y coordinates into text lines would solve that.
     *
     * Extending TextPosition makes it complex to get to the x & y coordinates, so just embed TextPosition in this
     * class.
     */
    private static class MyTextPosition implements Comparable<MyTextPosition> {

        /** The wrapped text, or null for an instance that only marks the limit of a subset extraction. */
        final TextPosition tp;
        final private float x, y;

        MyTextPosition(TextPosition tp) {
            this.tp = tp;
            y = tp.getY();
            x = tp.getX();
        }

        /**
         * Dummy constructor for setting the limits of a subset extraction
         */
        MyTextPosition(float x, float y) {
            this.x = x;
            this.y = y;
            this.tp = null;
        }

        @Override
        @SuppressWarnings("AccessingNonPublicFieldOfAnotherObject")
        public int compareTo(MyTextPosition other) {
            if (other.y != y) {
                return Float.compare(y, other.y);
            }
            return Float.compare(x, other.x);
        }

        @Override
        public String toString() {
            // Limit markers carry no text
            return ((tp == null ? "" : tp.getCharacter()) + " " + ", (" + x + ", " + y + ")");
        }
    }
}
package input.pdf;

import airspace.Zone;
import java.io.IOException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

/**
 * Extracts one or more tables from a PDF document, using the column positions and end-of-row / end-of-table rules
 * held in {@link PDFTable} objects. Characters are assigned to columns by their X coordinate and accumulated into
 * rows; each finished row is a String array with one entry per column.
 *
 * @author Frank van der Hulst <[email protected]>
 */
public class PdfPlateStripper extends PDFTextStripper {

    private final static Logger log = Logger.getLogger(PdfPlateStripper.class.getName());
    /** Row currently being filled, one entry per column. */
    protected String columns[];
    protected String prevBoundId = null;
    protected float currentLine;
    protected float bottomOfTable;
    protected float topOfTable;
    protected Zone prevZone = null;
    protected final int[] pgTops;
    protected final int[] pgBottoms;
    protected PDFTable currentTable;
    /** Rows collected so far for the current table. */
    protected ArrayList<String[]> result;
    private final List<String[]>[] results;
    private final PDFTable[] tables;
    private int tableNum;

    /**
     * @param tops Y coordinate of the top of the scanned area, one entry per page; the last entry is used for all
     * later pages
     * @param bottoms Y coordinate of the bottom of the scanned area, one entry per page; the last entry is used for
     * all later pages
     * @param tabs layouts of the tables to extract, in the order in which they appear; must contain at least one
     * entry
     * @throws IOException
     */
    @SuppressWarnings({"unchecked"})
    public PdfPlateStripper(int[] tops, int[] bottoms, PDFTable[] tabs) throws IOException {
        super();
        setLineSeparator("\n");
        setPageEnd("\f");
        this.setSortByPosition(true);
        pgTops = tops;
        pgBottoms = bottoms;
        tables = tabs;
        tableNum = 0;
        result = new ArrayList<>(0);
        results = (List<String[]>[]) Array.newInstance(result.getClass(), tables.length);
        currentTable = tables[tableNum];
        results[tableNum] = result;
        startNewRow();
    }

    /**
     * Replaces the current row with a fresh one in which every column is an empty String.
     */
    private void startNewRow() {
        columns = new String[currentTable.columnLimits.keySet().size() - 1];
        for (int i = 0; i < columns.length; i++) {
            columns[i] = "";
        }
    }

    /**
     * Finishes a row: it is added to the current result only if its 'required' column is not empty. A new empty row
     * is started in either case.
     *
     * @param row the row to finish; may be null, in which case nothing is saved
     */
    private void endRow(String[] row) {
        // First save any data... may be no data at start of page or end of table, in which case 'row' will be null
        if (row != null) {
            if (!row[currentTable.requiredColumn].isEmpty()) {
                result.add(row);
            }
            log.trace(result.size() + ": " + row[0]);
        }
        startNewRow();
    }

    /**
     * Called once for each page in the PDF document.
     *
     * NOTE(review): pages after the first are skipped by the check on the page number below, although the page
     * bounds arrays allow for several pages -- confirm this is intended.
     *
     * @throws IOException
     */
    @Override
    public void writePage() throws IOException {
        if (tableNum >= tables.length) {
            return;
        }
        int pageNo = getCurrentPageNo();
        if (pageNo > 1) {
            return;
        }
        float prevLine = -1000;
        String wholeLine = "";
        // Pages beyond the end of the bounds arrays use the last entry
        topOfTable = pgTops[(pageNo <= pgTops.length ? pageNo : pgTops.length) - 1];
        bottomOfTable = pgBottoms[(pageNo <= pgBottoms.length ? pageNo : pgBottoms.length) - 1];
        log.debug("Page " + pageNo + " bounds " + topOfTable + "-" + bottomOfTable);
        @SuppressWarnings("UseOfObsoleteCollectionType")
        final java.util.Vector<List<TextPosition>> pageText = getCharactersByArticle();

        for (List<TextPosition> article : pageText) {
            for (TextPosition tp : article) {
                final String character = tp.getCharacter();
                if (character == null || character.isEmpty()) {
                    // Nothing to add to any column
                    continue;
                }
                // Replace unbreakable space with ordinary space for ease of pattern matching
                char ch = character.charAt(0);
                float yPos = tp.getY();
                float xPos = tp.getX();
                if (ch == '\u00a0') {
                    ch = ' ';
                }

                // Skip page header & footer (footer may be parsed before body of page)
                if (yPos < topOfTable || yPos > bottomOfTable) {
                    continue;
                }

                // Check for end-of-table delimiter. This may be at any X position, so checking needs to be
                // separate from column handling
                if (yPos > prevLine) {
                    wholeLine = "" + ch;
                    if (log.getLevel() == Level.TRACE) {
                        System.out.print("\n" + ch);
                    }
                } else {
                    if (log.getLevel() == Level.TRACE) {
                        System.out.print(ch);
                    }
                    wholeLine += ch;
                    if (wholeLine.equals(currentTable.tableEnd)) {
                        // End-of-table delimiter found... step to next table. Any other data on this line is ignored.
                        endRow(columns);
                        tableNum++;
                        log.debug("Table " + tableNum + " delimiter at " + yPos);
                        if (tableNum >= tables.length) {
                            topOfTable = bottomOfTable + 1;
                            return;
                        }
                        currentTable = tables[tableNum];
                        topOfTable = yPos + currentTable.headerHeight;
                        result = new ArrayList<>(0);
                        results[tableNum] = result;
                        startNewRow();
                        prevLine = yPos;
                        continue;
                    }
                }
                prevLine = yPos;

                final Map.Entry<Float, Integer> column = currentTable.columnLimits.floorEntry(xPos);
                if (column == null) {
                    // The character is left of the first column limit, so it belongs to no column
                    log.trace("No column for X position " + xPos);
                    continue;
                }
                final int columnNo = column.getValue();
                assert (columnNo < columns.length) : "Position " + xPos + " > " + columnNo + " too big";

                // If the 'required' column is empty and this is to the right of the required column, then this is
                // actually part of the previous line
                if (columnNo > currentTable.requiredColumn && columns[currentTable.requiredColumn].isEmpty()
                        && !result.isEmpty()) {
                    // Add character to previous row
                    result.get(result.size() - 1)[columnNo] += ch;
                    continue;
                }

                if (currentTable.lineEnd != null
                        && currentTable.lineEnd.matcher(columns[columns.length - 1]).matches()) {
                    // Full row detected because the end-of-line pattern was detected in the last column.
                    endRow(columns);
                } else if (xPos < currentTable.returnColumnNum && !columns[currentTable.requiredColumn].isEmpty()) {
                    // Some tables don't have a detectable end-of-line pattern. In that case, the 'lineEnd' value is null,
                    // and a full row is detected if the 'required' column is not empty, and this X position is left of it...
                    // i.e. it is assumed that the cursor has moved back left of the 'required' column.
                    endRow(columns);
                }

                // Add character to appropriate column field.
                columns[columnNo] += ch;
            }
        }
        endRow(columns);
    }

    /**
     * Parse the specified tables in the specified PDF file
     *
     * @param filepath Full path to file
     * @return Array of Lists -- one List per table. Each list contains one String[] entry for each row in the table.
     * Each row is an array of Strings, with one entry for each column in the table.
     * @throws IOException
     */
    @SuppressWarnings("ReturnOfCollectionOrArrayField")
    public List<String[]>[] parse(String filepath) throws IOException {
        final PDDocument loaded = PDDocument.load(filepath);
        document = loaded;
        try {
            getText(loaded);
        } finally {
            // Release the file even if text extraction fails
            loaded.close();
        }
        return results;
    }
}
package input.pdf;

import airspace.Zone;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.List;
import java.util.NavigableMap;
import java.util.TreeMap;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;

/**
 * Describes the layout of one table in a PDF document, together with the rules used to decide when a row is
 * complete.
 *
 * Column boundaries are X-coordinates, either supplied directly by the caller or derived from the boxes drawn around
 * the table heading on the first page.
 *
 * Cell text may wrap over several lines, so Y-coordinates cannot be used to tell where a row starts or ends. Instead
 * a row is considered 'full' either when the last column matches the 'lineEnd' Pattern, or when the 'requiredColumn'
 * column is not empty and the X-coordinate has moved back to the left of 'returnColumnNum'.
 *
 * The end of the whole table is marked by the 'tableEnd' String appearing on a line by itself.
 *
 * Several tables with different layouts may be extracted consecutively from one PDF file. A table may span several
 * pages, and several tables may be stacked vertically on one page.
 *
 * @author Frank van der Hulst <[email protected]>
 */
public class PDFTable {

    protected final static Logger log = Logger.getLogger(Zone.class.getName());

    /** Identifier/descriptor of the table, used for debugging/documentation. */
    public final String name;
    /** Maps the left-hand X-coordinate of each column boundary to its boundary index. */
    final NavigableMap<Float, Integer> columnLimits;
    /** Height of the table heading, in PDF Y-coordinates. */
    final int headerHeight;
    /** Pattern matched against the last column to detect a full row; may be null. */
    final Pattern lineEnd;
    /** Text that marks the end of the table when it is found on a line by itself. */
    final String tableEnd;
    /** X-coordinate; text starting to the left of this may indicate that the previous row is full. */
    final float returnColumnNum;
    /** Index of the column that must be non-empty before a row is considered full. */
    final int requiredColumn;

    /**
     * Construct a table layout from explicitly specified column boundaries.
     *
     * @param tableName Identifier/descriptor of table, for debugging/documentation
     * @param returnX If the X-coordinate of the first item of a line is less than this, and the previous line's
     * 'required' column is not empty, the previous line is full. Must not exceed the left-hand edge of the
     * 'required' column. PDF X-coordinate.
     * @param reqCol Required column number. A line is only considered full if this column is not empty.
     * @param headerHeight Height of this table's headers in PDF Y-coordinates. This amount of space is skipped over
     * at the start of the table, and each page.
     * @param lineEnd End of line marker Pattern... null if the last column doesn't have a detectable pattern.
     * @param tableEnd End of table delimiter String... may not be null or empty. When this String is found on a line
     * by itself, the table is finished and the next table (if any) begun
     * @param columnLimits Limits of columns to extract... first entry should be 0, last entry should be max page
     * width, in PDF X-coordinates
     */
    public PDFTable(String tableName, int returnX, int reqCol, int headerHeight, Pattern lineEnd, String tableEnd,
            float... columnLimits) {
        this.name = tableName;
        this.returnColumnNum = returnX;
        this.requiredColumn = reqCol;
        this.headerHeight = headerHeight;
        this.lineEnd = lineEnd;
        this.tableEnd = tableEnd;

        // Number the boundaries left-to-right in the order the caller supplied them.
        final NavigableMap<Float, Integer> limits = new TreeMap<>();
        int boundaryIndex = 0;
        for (float limit : columnLimits) {
            limits.put(limit, boundaryIndex++);
        }
        this.columnLimits = limits;

        assert tableEnd != null && !tableEnd.isEmpty();
        assert columnLimits.length > requiredColumn + 1 : "requiredColumn too big: " + requiredColumn;
        assert returnX <= columnLimits[requiredColumn] :
                "returnX too big: " + returnX + " vs " + columnLimits[requiredColumn];
    }

    /**
     * Construct a table layout by inspecting the PDF itself. The column limits are calculated by locating the table
     * heading on the first page of the document and taking the left-hand edge of each box found within it.
     *
     * @param tableName Identifier/descriptor of table, for debugging/documentation
     * @param filepath Path of the PDF file to scan
     * @param reqCol Required column number. A line is only considered full if this column is not empty.
     * @param lineEnd End of line marker Pattern... null if the last column doesn't have a detectable pattern.
     * @param tableEnd End of table delimiter String
     * @param bounds Passed to the LinedTableStripper; its first element is also subtracted from the bottom of the
     * heading to give the header height
     * @param firstHdrLine Pattern to describe first line of heading
     * @param firstDataLine Pattern to describe first line of data
     * @throws java.io.IOException
     */
    public PDFTable(String tableName, String filepath, int reqCol, Pattern lineEnd, String tableEnd, float[] bounds,
            String firstHdrLine, String firstDataLine) throws IOException {
        this.name = tableName;
        this.columnLimits = new TreeMap<>();
        this.lineEnd = lineEnd;
        this.tableEnd = tableEnd;
        this.requiredColumn = reqCol;
        log.debug("PDFTable(\"" + tableName + "\")");

        try (PDDocument pdf = PDDocument.load(filepath)) {
            final LinedTableStripper headerScanner = new LinedTableStripper(bounds);
            log.info("Scanning first page of " + filepath);
            @SuppressWarnings("unchecked")
            final List<PDPage> pages = pdf.getDocumentCatalog().getAllPages();
            final PDPage firstPage = pages.get(0);
            headerScanner.drawPage(firstPage);

            // Y coordinates of the top [0] and bottom [1] of the table heading
            final float[] tableBounds = headerScanner.findTable(firstHdrLine, firstDataLine);
            assert (tableBounds[0] != Float.MAX_VALUE && tableBounds[1] != Float.MAX_VALUE) :
                    "Table header missing: " + tableBounds[0] + "-" + tableBounds[1];
            log.debug("Table header found: " + tableBounds[0] + "-" + tableBounds[1]);
            log.trace("Boxes: " + headerScanner.boxes.size());

            // Boundary 0 is always the left-hand edge of the page.
            this.columnLimits.put(0f, 0);
            int nextBoundary = 1;
            float requiredColumnLeft = -1;
            for (Object box : headerScanner.boxes) {
                final Rectangle2D.Float cell = (Rectangle2D.Float) box;
                log.trace("Cell (" + cell.x + ", " + cell.y + ", " + cell.width + ", " + cell.height);

                // Only boxes lying entirely within the heading define column boundaries.
                final boolean outsideHeading = cell.y < tableBounds[0] || cell.y + cell.height > tableBounds[1];
                if (!outsideHeading) {
                    this.columnLimits.put(cell.x, nextBoundary);
                    if (nextBoundary == requiredColumn) {
                        requiredColumnLeft = cell.x;
                    }
                    nextBoundary++;
                    log.debug("Column boundary at " + cell.x);
                }
            }
            this.returnColumnNum = requiredColumnLeft;

            // Final boundary: fixed right-hand limit
            this.columnLimits.put(850f, nextBoundary);
            this.headerHeight = (int) (tableBounds[1] - bounds[0]);
            log.debug("Table header height: " + headerHeight);
        }
    }
}
package input.pdf;

import airspace.Zone;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

/**
 * Extract data from specified tables in the PDF file.
 *
 * The layout of each table is defined by a PDFTable object.
 *
 * Text in a cell may be wrapped across several lines, so Y-coordinates are not used to identify when a row begins or
 * ends. The multiple lines of a wrapped cell are unwrapped and returned as a single String. To accomplish this, a
 * 'full' row is recognized either because the last column entry matches the 'lineEnd' Pattern, or because the
 * 'requiredColumn' column is not empty and the X-coordinate is now to the left of 'returnColumnNum'... this may be
 * the right-hand side of the first (or other) column, or the left-hand side of the 'requiredColumn' column.
 *
 * The end of the entire table is recognized by the 'tableEnd' String on a line by itself.
 *
 * Multiple tables with different layouts can be specified to be extracted consecutively from a single PDF file.
 *
 * A single table may span multiple pages, or there can be several tables vertically on one page.
 *
 * This is by no means a complete solution to extracting data tables from PDF files.
 *
 * @see PDFTable
 *
 * @author Frank van der Hulst <[email protected]>
 */
public class PDFTableStripper extends PDFTextStripper {

    // NOTE(review): the logger is registered under Zone's name rather than this class's -- confirm whether that is
    // intentional before changing it, since it determines which logging configuration applies.
    private final static Logger log = Logger.getLogger(Zone.class.getName());

    /** Maximum number of leading columns echoed to the debug log for each completed row. */
    private static final int MAX_LOGGED_COLUMNS = 5;

    private final PDFTable[] tables;
    private final int[] pgTops;
    private final int[] pgBottoms;
    private String columns[];
    private float bottomMargin;
    private float topMargin;
    private int tableNum;
    private PDFTable currentTable;
    private ArrayList<String[]> result;
    private final List<String[]>[] results;

    /**
     * Construct a stripper that extracts one or more tables consecutively.
     *
     * @param tops Height of header of each page. The last value given is used for any subsequent pages.
     * @param bottoms Height of footer of each page. The last value given is used for any subsequent pages.
     * @param tabs Table layouts
     * @throws IOException
     */
    @SuppressWarnings({"unchecked"})
    public PDFTableStripper(int[] tops, int[] bottoms, PDFTable... tabs) throws IOException {
        super();
        setLineSeparator("\n");
        setPageEnd("\f");
        // setSortByPosition(true); // setSortByPosition DOES NOT WORK!
        pgTops = tops;
        pgBottoms = bottoms;
        tables = tabs;
        tableNum = 0;
        result = new ArrayList<>(0);
        results = (List<String[]>[]) Array.newInstance(result.getClass(), tables.length);
        currentTable = tables[tableNum];
        results[tableNum] = result;
        startNewRow();
    }

    /**
     * Construct a stripper that extracts a single table, with page bounds of 0 to 850 on every page.
     *
     * @param tab Table layout
     * @throws IOException
     */
    @SuppressWarnings("unchecked")
    public PDFTableStripper(PDFTable tab) throws IOException {
        super();
        setLineSeparator("\n");
        setPageEnd("\f");
        this.setSortByPosition(true);
        pgTops = new int[]{0};
        pgBottoms = new int[]{850};
        topMargin = 0;
        bottomMargin = 850;
        tableNum = 0;
        result = new ArrayList<>(0);
        results = (List<String[]>[]) Array.newInstance(result.getClass(), 1);
        results[0] = result;
        currentTable = tab;
        // writePage() and processTextPosition() both read tables.length, so this must be a real one-element array;
        // leaving it null caused a NullPointerException as soon as the first page was processed.
        tables = new PDFTable[]{tab};
        startNewRow();
    }

    /**
     * Replace the row being accumulated with a fresh one in which every column of the current table is empty.
     */
    private void startNewRow() {
        columns = new String[currentTable.columnLimits.keySet().size() - 1];
        for (int i = 0; i < columns.length; i++) {
            columns[i] = "";
        }
    }

    /**
     * Finish the row being accumulated: store it in the current table's results if its 'required' column is not
     * empty, then begin a new empty row.
     *
     * @param columns Row to finish; may be null
     */
    private void endRow(String[] columns) {
        // First save any data... may be no data at start of page or end of table, in which case 'columns' will be null
        if (columns != null && !columns[currentTable.requiredColumn].isEmpty()) {
            result.add(columns);
            if (log.isDebugEnabled()) {
                // Echo at most the first few columns. Bounded by the actual column count, so tables with fewer
                // than three columns no longer throw ArrayIndexOutOfBoundsException here.
                final StringBuilder preview = new StringBuilder().append(columns[0]);
                for (int i = 1; i < columns.length && i < MAX_LOGGED_COLUMNS; i++) {
                    preview.append(", ").append(columns[i]);
                }
                log.debug(preview.toString());
            }
        }
        startNewRow();
    }

    /** Y-coordinate of the previously processed character. */
    private float prevLine = -1000;
    /** Text accumulated so far on the current line, used to detect the end-of-table delimiter. */
    private String wholeLine = "";

    /**
     * Process one character of page text: skip it if it lies in the page header or footer, check whether it
     * completes the end-of-table delimiter, and otherwise append it to the appropriate column of the current (or
     * previous) row.
     *
     * @param xPos X-coordinate of the character
     * @param yPos Y-coordinate of the character
     * @param ch The character
     */
    public void processTextPosition(float xPos, float yPos, char ch) {
        if (log.getLevel() == Level.TRACE) {
            System.out.print(ch);
        }
        // Skip page header & footer (footer may be parsed before body of page)
        if (yPos < topMargin || yPos > bottomMargin) {
            return;
        }
        if (ch == '\u00a0') {
            // Replace non-breaking space with ordinary space for ease of pattern matching
            ch = ' ';
        } else if (ch == '\u00ad') {
            // Replace soft hyphen with ordinary minus for ease of pattern matching
            ch = '-';
        }

        // Check for end-of-table delimiter. This may be at any X position, so checking needs to be
        // separate from column handling
        if (yPos > prevLine) {
            wholeLine = "" + ch;
        } else {
            wholeLine += ch;
            if (wholeLine.equals(currentTable.tableEnd)) {
                // End-of-table delimiter found... step to next table. Any other data on this line is ignored.
                endRow(columns);
                tableNum++;
                log.debug("Table " + tableNum + " delimiter at " + yPos);
                if (tableNum >= tables.length) {
                    // No more tables: make the margins exclude everything from here on
                    topMargin = bottomMargin + 1;
                    return;
                }
                currentTable = tables[tableNum];
                topMargin = yPos + currentTable.headerHeight;
                result = new ArrayList<>(0);
                results[tableNum] = result;
                startNewRow();
                prevLine = yPos;
                return;
            }
        }
        prevLine = yPos;

        // Text to the left of the first column limit belongs to no column; ignore it rather than fail with a
        // NullPointerException on the missing floor entry.
        final Float columnStart = currentTable.columnLimits.floorKey(xPos);
        if (columnStart == null) {
            return;
        }
        int columnNo = currentTable.columnLimits.get(columnStart);
        assert (columnNo < columns.length) : "Position " + xPos + " > " + columnNo + " too big";

        // If the 'required' column is empty and this is to the right of the required column, then this is
        // actually part of the previous line
        if (columnNo > currentTable.requiredColumn && columns[currentTable.requiredColumn].isEmpty()
                && !result.isEmpty()) {
            // Add character to previous row
            result.get(result.size() - 1)[columnNo] += ch;
            return;
        }

        if (currentTable.lineEnd != null && currentTable.lineEnd.matcher(columns[columns.length - 1]).matches()) {
            // Full row detected because the end-of-line pattern was detected in the last column.
            endRow(columns);
        } else if (xPos < currentTable.returnColumnNum && !columns[currentTable.requiredColumn].isEmpty()) {
            // Some tables don't have a detectable end-of-line pattern. In that case, the 'lineEnd' value is null,
            // and a full row is detected if the 'required' column is not empty, and this X position is left of it...
            // i.e. it is assumed that the cursor has moved back left of the 'required' column.
            endRow(columns);
        }

        // Add character to appropriate column field.
        columns[columnNo] += ch;
    }

    /**
     * Called once for each page in the PDF document. Sets the header/footer margins for the page, feeds the first
     * character of every text position to {@link #processTextPosition(float, float, char)}, then finishes the row in
     * progress.
     *
     * @throws IOException
     */
    @Override
    public void writePage() throws IOException {
        if (tableNum >= tables.length) {
            return;
        }
        int pageNo = getCurrentPageNo();
        // Pages beyond the end of the margin arrays reuse the last value given
        topMargin = pgTops[(pageNo <= pgTops.length ? pageNo : pgTops.length) - 1];
        bottomMargin = pgBottoms[(pageNo <= pgBottoms.length ? pageNo : pgBottoms.length) - 1];
        log.debug("Page " + pageNo + " bounds " + topMargin + "-" + bottomMargin);
        wholeLine = "";
        @SuppressWarnings("UseOfObsoleteCollectionType")
        final java.util.Vector<List<TextPosition>> pageText = getCharactersByArticle();
        for (List<TextPosition> l : pageText) {
            for (TextPosition tp : l) {
                final String text = tp.getCharacter();
                if (text == null || text.isEmpty()) {
                    // Nothing to add; charAt(0) would throw on an empty string
                    continue;
                }
                processTextPosition(tp.getX(), tp.getY(), text.charAt(0));
            }
        }
        endRow(columns);
    }

    /**
     * Convenience method to parse the specified tables in the specified PDF file. The document is closed before
     * returning, including when extraction fails.
     *
     * @param file PDF file to read
     * @return Array of Lists -- one List per table. Each list contains one String[] entry for each row in the table.
     * Each row is an array of Strings, with one entry for each column in the table.
     * @throws IOException
     */
    @SuppressWarnings("ReturnOfCollectionOrArrayField")
    public List<String[]>[] parse(File file) throws IOException {
        try (PDDocument doc = PDDocument.load(file)) {
            document = doc;
            getText(doc);
        }
        return results;
    }
}
package input.pdf;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

/**
 * Extracts the text between a top and a bottom margin as a list of lines for each page, up to a given page number.
 *
 * @author <a href="mailto:[email protected]">Frank van der Hulst</a>
 * @version $Revision: 1.00 $
 */
public class TextStripper extends PDFTextStripper {

    private final static Logger log = Logger.getLogger(PDFTableStripper.class.getName());

    /** Text with a Y-coordinate less than this is skipped. */
    protected final int topMargin;
    /** Text with a Y-coordinate greater than this is skipped. */
    protected final int bottomMargin;
    /** Pages numbered above this are not extracted. */
    protected final int lastPage;
    /** One list of lines for each page extracted so far. */
    private final ArrayList<ArrayList<String>> results = new ArrayList<>(10);

    /**
     * @param document Document to extract text from
     * @param top Text with a Y-coordinate less than this is skipped
     * @param bottom Text with a Y-coordinate greater than this is skipped
     * @param numPages Number of the last page to extract
     * @throws IOException
     */
    public TextStripper(PDDocument document, int top, int bottom, int numPages) throws IOException {
        super();
        setSortByPosition(true);
        this.document = document;
        this.lastPage = numPages;
        this.topMargin = top;
        this.bottomMargin = bottom;
    }

    /**
     * Collects the lines of text of the current page. A new line is started whenever the Y-coordinate differs from
     * that of the current line by more than a quarter of the text height; within a line, a space is inserted
     * wherever the gap from the end of the previous character exceeds a quarter of the width of a space.
     *
     * @throws IOException
     */
    @Override
    public void writePage() throws IOException {
        final int pageNo = getCurrentPageNo();
        log.debug("Page " + pageNo + " bounds " + topMargin + "-" + bottomMargin);
        if (pageNo > lastPage) {
            return;
        }

        final ArrayList<String> pageLines = new ArrayList<>(60);
        results.add(pageLines);

        String pending = "";      // text of the line being assembled
        float lineY = -1000;      // Y-coordinate of the line being assembled
        float expectedX = 0;      // right-hand edge of the previous character

        @SuppressWarnings("UseOfObsoleteCollectionType")
        final java.util.Vector<List<TextPosition>> articles = getCharactersByArticle();
        for (List<TextPosition> article : articles) {
            for (TextPosition glyph : article) {
                final float y = glyph.getY();
                if (y < topMargin || y > bottomMargin) {
                    // Outside the margins
                    continue;
                }

                final boolean startsNewLine = Math.abs(y - lineY) > glyph.getHeight() / 4;
                if (startsNewLine) {
                    // Store the finished line, then begin the next one with this character
                    if (!pending.isEmpty()) {
                        if (log.getLevel() == Level.TRACE) {
                            System.out.println(pending);
                        }
                        pageLines.add(pending);
                    }
                    pending = glyph.getCharacter();
                    lineY = y;
                } else {
                    final float gapLimit = expectedX + glyph.getWidthOfSpace() / 4;
                    if (glyph.getX() > gapLimit) {
                        pending += ' ';
                    }
                    pending += glyph.getCharacter();
                }
                expectedX = glyph.getX() + glyph.getWidth();
            }
        }

        // Store the last line of the page
        if (!pending.isEmpty()) {
            pageLines.add(pending);
        }
    }

    /**
     * Extract the text of the document given to the constructor.
     *
     * @return List of Lists -- one List for each page extracted, containing one String for each line of text found
     * between the margins on that page.
     * @throws IOException
     */
    @SuppressWarnings("ReturnOfCollectionOrArrayField")
    public ArrayList<ArrayList<String>> parse() throws IOException {
        getText(document);
        return results;
    }
}
--------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]

