Re: PDF extraction

Frank van der Hulst Mon, 02 Feb 2015 10:54:56 -0800

On Tue, Feb 3, 2015 at 4:29 AM, Lorena Leishman <
[email protected]> wrote:


> Hi,
> I have a PDF that has information displayed on tables. Example:
> Company Name:   Barnes & Noble   Bank Of America   Macy's
> Account #:      123xxxxx         345xxxx           679xxxx
> Status:         Open             Closed            Open
> Balance:        $23.             $0.00             $100
> Is there a way with PDFbox to extract a specific value(s) from the table?
> Example: Bank Of America  and $0.00
> And also is there a way to cut the whole table and paste it into a
> different PDF?
> Please let me know, Thanks!
> Lorena

package input.pdf;

import com.vividsolutions.jts.geom.Coordinate;
import com.vividsolutions.jts.geom.LineString;
import java.awt.Dimension;
import java.awt.Toolkit;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import javafx.application.Platform;
import javafx.scene.Scene;
import javafx.scene.canvas.Canvas;
import javafx.scene.canvas.GraphicsContext;
import javafx.scene.layout.StackPane;
import javafx.scene.paint.Color;
import javafx.stage.Stage;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.geotools.data.simple.SimpleFeatureCollection;
import org.geotools.data.simple.SimpleFeatureIterator;
import output.ShapeStyle;
import topography.LineFeature;
/**
 *
 * @author Frank van der Hulst <[email protected]>
 */

public class Display1 {

  private final static Logger log = Logger.getLogger(Display1.class.getName());

  private final Dimension screen = Toolkit.getDefaultToolkit().getScreenSize();
  private StackPane root;
  private GraphicsContext gc;
  private Stage stage = null;
  private Canvas canvas;
  private double width, height, scale, xOffset, yOffset;

  /**
   * Creates the display window and blocks until its canvas is ready for drawing.
   *
   * The stage, scene and canvas are built on the JavaFX application thread. When called from any other thread the
   * constructor waits on a latch until that work has finished, so the fields written on the FX thread are safely
   * visible to the caller. When called from the FX thread itself the window is built inline (waiting there would
   * block the very thread that has to do the work).
   *
   * The window is scaled so that one page fills the full height of the screen.
   *
   * @param title window title
   * @param pageWidth width of the page, in page units
   * @param pageHeight height of the page, in page units
   * @param xOffset X offset subtracted from coordinates by {@link #drawLineString}
   * @param yOffset Y offset subtracted from coordinates by {@link #drawLineString}
   * @throws IllegalStateException if the graphics context could not be created on the FX thread
   */
  public Display1(final String title, double pageWidth, double pageHeight, double xOffset, double yOffset) {
    this.xOffset = xOffset;
    this.yOffset = yOffset;
    final Runnable buildStage = () -> {
      stage = new Stage();
      root = new StackPane();
      stage.setX(0);
      stage.setY(0);
      stage.setTitle(title);
      scale = (double) screen.height / pageHeight;
      width = pageWidth * scale;
      height = screen.height;
      Scene scene = new Scene(root, width, height);
      canvas = new Canvas(width, height);
      root.getChildren().add(canvas);
      stage.setWidth(width);
      stage.setHeight(height);
      stage.setScene(scene);
      stage.show();
      gc = canvas.getGraphicsContext2D();
    };

    final long startNanos = System.nanoTime();
    if (Platform.isFxApplicationThread()) {
      buildStage.run();
    } else {
      final java.util.concurrent.CountDownLatch ready = new java.util.concurrent.CountDownLatch(1);
      Platform.runLater(() -> {
        try {
          buildStage.run();
        } finally {
          // Always release the waiting thread, even if building the stage failed
          ready.countDown();
        }
      });
      // Wait uninterruptibly (as the original did), but remember the interrupt instead of swallowing it
      boolean interrupted = false;
      while (true) {
        try {
          ready.await();
          break;
        } catch (InterruptedException ex) {
          interrupted = true;
        }
      }
      if (interrupted) {
        Thread.currentThread().interrupt();
      }
    }
    if (gc == null) {
      throw new IllegalStateException("JavaFX graphics context could not be created");
    }
    log.trace("Waited " + ((System.nanoTime() - startNanos) / 1000000L) + "ms for graphics");
  }

  /**
   * Closes the window and releases the canvas and graphics context.
   *
   * The references are cleared on the FX thread, after the stage is closed, rather than immediately on the calling
   * thread. Drawing requests already queued with {@code Platform.runLater} dereference {@code gc} when they run;
   * clearing it from the caller's thread could make those pending requests fail with a NullPointerException.
   */
  public void close() {
    final Stage closing = stage;
    Platform.runLater(() -> {
      if (closing != null) {
        closing.close();
      }
      canvas = null;
      gc = null;
    });
  }

  /**
   * Converts an AWT colour to the equivalent JavaFX colour.
   *
   * AWT components are integers in the range 0-255; the JavaFX constructor takes doubles in the range 0.0-1.0. The
   * division must therefore be done in floating point: integer division by 255 would yield 0 for every component
   * value except 255.
   *
   * @param awt the AWT colour to convert
   * @return the JavaFX colour with the same red, green, blue and alpha
   */
  public Color javaFX(java.awt.Color awt) {
    return new javafx.scene.paint.Color(
            awt.getRed() / 255.0,
            awt.getGreen() / 255.0,
            awt.getBlue() / 255.0,
            awt.getAlpha() / 255.0);
  }

  /**
   * Draws a closed outline through the given points.
   *
   * Coordinates are scaled on the calling thread; the actual drawing is queued for the FX thread. Outlines of fewer
   * than 256 points are drawn as a single polygon. Longer ones are drawn as overlapping polylines of at most 250
   * points each, followed by a line joining the first and last points.
   *
   * @param L points of the outline; nothing is drawn if null or empty
   * @param c stroke colour
   * @param lw line width
   */
  public void drawPolyLine(final ArrayList<java.awt.Point.Float> L, final Color c, final float lw) {
    assert gc != null : "Null gc";
    if (L == null || L.isEmpty()) {
      return;
    }
    final int total = L.size();
    final double[] xs = new double[total];
    final double[] ys = new double[total];
    for (int k = 0; k < total; k++) {
      final java.awt.Point.Float pt = L.get(k);
      xs[k] = pt.x * scale;
      ys[k] = pt.y * scale;
    }

    Platform.runLater(() -> {
      gc.setStroke(c);
      gc.setFill(null);
      gc.setLineWidth(lw);
      if (total < 256) {
        gc.strokePolygon(xs, ys, total);
      } else {
        // Consecutive chunks share one point (step 249, length up to 250) so the outline has no gaps
        int start = 0;
        while (start < total) {
          final int chunk = Math.min(250, total - start);
          final int stop = start + chunk;
          gc.strokePolyline(Arrays.copyOfRange(xs, start, stop), Arrays.copyOfRange(ys, start, stop), chunk);
          start += 249;
        }
        // Close the outline
        gc.strokeLine(xs[0], ys[0], xs[total - 1], ys[total - 1]);
      }
    });
  }

  /**
   * Draws a single straight segment between two points.
   *
   * The end points are scaled on the calling thread and the stroke is queued for the FX thread.
   *
   * @param x1 X coordinate of the first end point
   * @param y1 Y coordinate of the first end point
   * @param x2 X coordinate of the second end point
   * @param y2 Y coordinate of the second end point
   * @param c stroke colour
   * @param lw line width
   */
  public void drawSegment(float x1, float y1, float x2, float y2, final Color c, final float lw) {
    assert gc != null : "Null gc";
    final double[] xs = new double[2];
    final double[] ys = new double[2];
    xs[0] = x1 * scale;
    xs[1] = x2 * scale;
    ys[0] = y1 * scale;
    ys[1] = y2 * scale;

    final Runnable stroke = () -> {
      gc.setStroke(c);
      gc.setLineWidth(lw);
      gc.strokePolygon(xs, ys, xs.length);
    };
    Platform.runLater(stroke);
  }

  /**
   * Draws the outline of a rectangle.
   *
   * The four corners are scaled on the calling thread and the outline is queued for the FX thread.
   *
   * @param area rectangle to outline; nothing is drawn if null
   * @param c stroke colour
   * @param lw line width
   */
  public void drawRectangle(Rectangle2D.Float area, final Color c, final float lw) {
    assert gc != null : "Null gc";
    if (area == null) {
      return;
    }
    final double left = area.x * scale;
    final double right = (area.x + area.width) * scale;
    final double top = area.y * scale;
    final double bottom = (area.y + area.height) * scale;
    // Corners in drawing order: top-left, top-right, bottom-right, bottom-left
    final double[] xs = {left, right, right, left};
    final double[] ys = {top, top, bottom, bottom};

    final Runnable stroke = () -> {
      gc.setStroke(c);
      gc.setLineWidth(lw);
      gc.strokePolygon(xs, ys, xs.length);
    };
    Platform.runLater(stroke);
  }

  /**
   * Draws a string at the given position.
   *
   * The position is scaled and the text filled on the FX thread.
   *
   * @param str text to draw; nothing is drawn if null or empty
   * @param c fill colour
   * @param x X coordinate of the text, in page units
   * @param y Y coordinate of the text, in page units
   */
  public void drawText(String str, Color c, float x, float y) {
    assert gc != null : "Null gc";
    final boolean nothingToDraw = str == null || str.isEmpty();
    if (!nothingToDraw) {
      final Runnable fill = () -> {
        gc.setFill(c);
        gc.fillText(str, x * scale, y * scale);
      };
      Platform.runLater(fill);
    }
  }

  /**
   * Hides or re-shows the window.
   *
   * The change is queued for the FX thread, like every other window operation in this class; JavaFX stages must
   * only be shown or hidden from that thread.
   *
   * @param min true to hide the window, false to show it
   */
  public void minimize(boolean min) {
    Platform.runLater(() -> {
      if (min) {
        stage.hide();
      } else {
        stage.show();
      }
    });
  }

  /**
   * Draws every feature of a collection as a line string.
   *
   * Each feature is wrapped in a {@code LineFeature} and its default geometry, cast to {@code LineString}, is
   * passed to {@link #drawLineString}. The feature iterator is closed when iteration ends.
   *
   * @param src collection of features to draw
   * @param style style handed on to {@link #drawLineString}
   */
  public void drawFeatureCollection(SimpleFeatureCollection src, ShapeStyle style) {
    try (SimpleFeatureIterator features = src.features()) {
      for (; features.hasNext();) {
        final LineFeature feature = new LineFeature(features.next());
        final LineString geometry = (LineString) feature.getDefaultGeometry();
        drawLineString(geometry, style);
      }
    }
  }

  /**
   * Draws a line string as an open polyline.
   *
   * Coordinates are shifted by the display's X/Y offsets and scaled on the calling thread; the drawing itself is
   * queued for the FX thread. Lines of fewer than 256 points are drawn in one call, longer ones as overlapping
   * polylines of at most 250 points each.
   *
   * The line is always drawn one unit wide in blue-violet; the {@code style} argument is currently not consulted.
   *
   * @param ls line string to draw
   * @param style accepted for interface compatibility; unused by this method
   */
  public void drawLineString(LineString ls, ShapeStyle style) {
    final Coordinate[] vertices = ls.getCoordinates();
    assert gc != null : "Null gc";
    if (vertices == null || vertices.length == 0) {
      return;
    }
    final int total = vertices.length;
    final double[] xs = new double[total];
    final double[] ys = new double[total];
    for (int k = 0; k < total; k++) {
      final Coordinate vertex = vertices[k];
      xs[k] = (vertex.x - xOffset) * scale;
      ys[k] = (vertex.y - yOffset) * scale;
    }

    Platform.runLater(() -> {
      gc.setStroke(Color.BLUEVIOLET);
      gc.setFill(Color.BLUEVIOLET);
      gc.setLineWidth(1);
      if (total < 256) {
        gc.strokePolyline(xs, ys, total);
      } else {
        // Consecutive chunks share one point (step 249, length up to 250) so the line has no gaps
        int start = 0;
        while (start < total) {
          final int chunk = Math.min(250, total - start);
          final int stop = start + chunk;
          gc.strokePolyline(Arrays.copyOfRange(xs, start, stop), Arrays.copyOfRange(ys, start, stop), chunk);
          start += 249;
        }
      }
    });
  }
}

package input.pdf;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.awt.Dimension;
import java.awt.Image;
import java.awt.Point;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.PathIterator;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NavigableSet;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfviewer.PageDrawer;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.TextPosition;

/**
 * Scan a PDF document page by page, and extract a table delimited by lines of text that match the specified delimiters.
 *
 * @author <a href="mailto:[email protected]">Frank van der Hulst</a>
 * @version $Revision: 1.00 $
 */
public class LinedTableStripper extends PageDrawer {

  private final static Logger log = Logger.getLogger(LinedTableStripper.class.getName());

  /**
   * Top and bottom Y coordinates of the range to be scanned for the table, to exclude page headers and footers. If
   * null, all of each page is scanned.
   *
   */
  protected float[] pageMargins;

  /**
   * ArrayList of rows, one for each row in the result table. Each row consists of an ArrayList of Strings. Rows may not
   * all be the same size. Phantom rows may be generated which contain nothing but empty Strings.
   */
  private ArrayList<ArrayList<String>> table;
  private TreeSet<MyTextPosition> textItems = null;
  @SuppressWarnings("PackageVisibleField")
  TreeSet<TableCell> boxes = null;
  private TreeSet<TableCell> horizLines = null;
  private TreeSet<TableCell> vertLines = null;
  private float pageWidth, pageHeight, rotation;

  /**
   * Creates a stripper with an empty result table.
   *
   * @param pageMargins two Y values (top, bottom) outside which page content is ignored, intended to exclude page
   * headers and footers. May be null, in which case the whole page is scanned. When given, the first value must be
   * smaller than the second (checked by assertion only).
   * @throws IOException if the superclass constructor fails.
   */
  public LinedTableStripper(float[] pageMargins) throws IOException {
    super();
    if (pageMargins != null) {
      assert pageMargins[0] < pageMargins[1];
    }
    table = new ArrayList<>(0);
    this.pageMargins = pageMargins;
  }

  /**
   * Main entry point: scans one page of the document and extracts the table found on it.
   *
   * The page is first processed by {@link #drawPage}, which collects its text and graphic lines. The collected text
   * is then searched by {@link #findTable} for the lines matching the two delimiters. If both are found,
   * {@link #extractTable} splits the text between them into cells and appends the rows to the result table.
   *
   * @param p The page to scan.
   * @param delimiter0 String containing regex pattern to identify the beginning of the table. Required.
   * @param delimiter1 String containing regex pattern to identify the end of the table. Required.
   * @return the result table (the stripper's own list, which accumulates rows across calls), or null if either
   * delimiter was not found on this page
   *
   * @throws IOException If there is an IO error while drawing the page.
   */
  @SuppressWarnings("ReturnOfCollectionOrArrayField")
  public ArrayList<ArrayList<String>> parsePage(PDPage p, String delimiter0, String delimiter1) throws IOException {
    drawPage(p);
    final float[] bounds = findTable(delimiter0, delimiter1);
    final boolean topFound = bounds[0] != Float.MAX_VALUE;
    final boolean bottomFound = bounds[1] != Float.MAX_VALUE;
    if (topFound && bottomFound) {
      log.debug("Table found: " + bounds[0] + "-" + bounds[1]);
      extractTable(bounds);
      return table;
    }
    return null;
  }

  /**
   * Traces the operator name and its arguments, then delegates to the superclass.
   *
   * @param operation name of the operator
   * @param arguments operands of the operator
   * @throws IOException if the superclass fails to process the operator
   */
  @Override
  public void processOperator(String operation, List<COSBase> arguments) throws IOException {
    final String operands = arguments.toString();
    log.trace("processOperator1: " + operation + ", " + operands);
    super.processOperator(operation, arguments);
  }

  /**
   * Traces the operator, then delegates to the superclass for everything except the "Do" operator.
   *
   * "Do" is skipped because it is used to draw embedded images, which are not needed for text operations.
   *
   * @param operator the operator to process
   * @param arguments operands of the operator
   * @throws IOException if the superclass fails to process the operator
   */
  @Override
  protected void processOperator(PDFOperator operator, List<COSBase> arguments) throws IOException {
    final String name = operator.getOperation();
    log.trace("processOperator2: " + operator.toString() + ", " + name + ", " + arguments.toString());
    final boolean isImageOperator = name.equals("Do");
    if (!isImageOperator) {
      super.processOperator(operator, arguments);
    }
  }

  /**
   * Scan one page in the document.
   *
   * Populates textItems, boxes, vertLines and horizLines via calls to processTextPosition(), fillPath(),
   * strokePath(), drawImage() and shFill() made from processStream().
   *
   * In the PDF document itself, there is no requirement for the objects to be in any order... i.e. they do not need to
   * be sorted top to bottom. So textItems, boxes, horizLines, and vertLines are SORTED collections, to provide the data
   * needed by findTable() and extractTable() in the right order.
   *
   * The four collections are reset before anything else, including for a page with no content. Otherwise an empty
   * page would leave the previous page's items in place (or leave the collections null on the first page), and the
   * following findTable() would search stale data.
   *
   * Named drawPage to indicate its functional similarity to the drawPage method in PageDrawer.
   *
   * @param p The page to scan.
   *
   * @throws IOException If there is an IO error while drawing the page.
   */
  @SuppressWarnings("SuspiciousNameCombination")
  public void drawPage(PDPage p) throws IOException {
    // Start every page with empty collections, even one that turns out to have no content
    textItems = new TreeSet<>();
    boxes = new TreeSet<>();
    horizLines = new TreeSet<>();
    vertLines = new TreeSet<>();

    // Only process the page if there is some content
    // Otherwise we are done here and return -- nothing has been collected for this page
    if (p.getContents() == null) {
      return;
    }
    log.debug("drawPage()");

    page = p;
    rotation = p.getRotation();
    PDRectangle cropBox = p.findCropBox();
    if (rotation == 90) {
      // A page rotated by 90 degrees swaps its width and height
      pageWidth = cropBox.getHeight();
      pageHeight = cropBox.getWidth();
    } else {
      pageWidth = cropBox.getWidth();
      pageHeight = cropBox.getHeight();
    }
    pageSize = new Dimension((int) pageWidth, (int) pageHeight); // Unused directly in this class, but required by PageDrawer
    log.debug("Page size: " + pageSize.toString() + ", rotation: " + rotation);

    // Extract entire page contents to textItems, boxes, horizLines, vertLines
    // These are sets, sorted by Y position and X position
    processStream(p, p.findResources(), p.getContents().getStream());
    log.debug("Scan finished: Boxes: " + boxes.size() + ", Vert " + vertLines.size() + ", Horiz " + horizLines.size()
            + ", Text " + textItems.size());
  }

  /**
   * Find a table matching the criteria on the current page.
   *
   * Run once for each page in the document. This expects that drawPage() has run and populated the textItems sorted
   * collection. Consecutive text items whose Y coordinates differ by no more than a quarter of the item's height are
   * joined into one line; every completed line (including the last one on the page) is matched in full against both
   * delimiter patterns.
   *
   * May be overridden if there is some other mechanism to identify the top and bottom of the table.
   *
   * @param delimiter0 regex pattern that a whole line must match to mark the beginning of the table
   * @param delimiter1 regex pattern that a whole line must match to mark the end of the table
   * @return an array of 2 entries, being the Y coordinates of the top and bottom of the table. If either is not
   * found, that entry is Float.MAX_VALUE
   */
  protected float[] findTable(String delimiter0, String delimiter1) {
    log.debug("findTable()");
    assert !delimiter0.isEmpty() && !delimiter1.isEmpty();
    final float[] tableBounds = {Float.MAX_VALUE, Float.MAX_VALUE};
    if (textItems == null) {
      // No page has been scanned yet, so there is nothing to search
      return tableBounds;
    }
    // Compile once instead of once per line
    final java.util.regex.Pattern startPattern = java.util.regex.Pattern.compile(delimiter0);
    final java.util.regex.Pattern endPattern = java.util.regex.Pattern.compile(delimiter1);

    // Scan through the text for the specified delimiter text
    float currentY = -1000;
    final StringBuilder line = new StringBuilder();
    for (MyTextPosition tp : textItems) {
      if (Math.abs(tp.tp.getY() - currentY) > tp.tp.getHeight() / 4) {
        // Significant change in Y: the previous line is complete
        recordDelimiterMatches(line, currentY, startPattern, endPattern, tableBounds);
        line.setLength(0);
        currentY = tp.tp.getY();
      }
      line.append(tp.tp.getCharacter());
    }
    // The last line on the page has no following item to trigger the check above
    recordDelimiterMatches(line, currentY, startPattern, endPattern, tableBounds);
    return tableBounds;
  }

  /**
   * Matches one completed line against both delimiters, lowering the recorded Y of each on a match.
   */
  private static void recordDelimiterMatches(CharSequence line, float lineY, java.util.regex.Pattern startPattern,
          java.util.regex.Pattern endPattern, float[] tableBounds) {
    if (line.length() == 0) {
      return;
    }
    if (log.isTraceEnabled()) {
      log.trace(line);
    }
    if (lineY < tableBounds[0] && startPattern.matcher(line).matches()) {
      tableBounds[0] = lineY;
      log.debug("Delimiter 0 found at " + lineY);
    }
    if (lineY < tableBounds[1] && endPattern.matcher(line).matches()) {
      tableBounds[1] = lineY;
      log.debug("Delimiter 1 found at " + lineY);
    }
  }

  /**
   * Extract text divided at cell boundaries, and build the table.
   *
   * Creates a list of rectangle cell boundaries. The text corresponding to each cell is extracted from textItems and
   * appended to the current row of the result table.
   *
   * Where a cell contains multiple lines of text, they will be separated by newlines. Whitespace within each cell is
   * ignored, unless specified by an actual space character.
   *
   * Rows are appended to the 'table' field, which is created once in the constructor; rows from successive calls
   * therefore accumulate in the same list.
   *
   * This requires that the 'textItems' and the 'vertLines' sets both contain data. These have been populated earlier by
   * processStream().
   *
   * This particular method uses *only* vertical lines to determine the limits of a cell. I.e. it assumes that every
   * cell has a separate vertical line on either side of it. This is convenient because it allows easy left-to-right,
   * top-to-bottom processing of the text into rows and columns.
   *
   * Assumes that the table spans the entire width of the page. However, text to the left of the first vertical line is
   * ignored, as is text to the right of the last vertical line.
   *
   * This method may be overridden to handle tables whose cell boundaries are defined differently.
   *
   * NOTE(review): the x and width of each vertical line are overwritten in place while the line is still an element
   * of the sorted vertLines set, and both fields take part in TableCell.compareTo(). Confirm that vertLines is not
   * searched or iterated again for the same page after this method has run.
   *
   * @param tableBounds Y coordinates of the table boundaries (top, bottom).
   */
  protected void extractTable(float[] tableBounds) {
    assert textItems != null && vertLines != null;
    // Extract subset of the page's vertical lines that relates to the table.
    // The boundary markers sit at x = 0 on the top and bottom Y coordinates of the table.
    TableCell tableStart = new TableCell(0, tableBounds[0]);
    TableCell tableEnd = new TableCell(0, tableBounds[1]);
    NavigableSet<TableCell> tableCells = vertLines.subSet(tableStart, true, tableEnd, true);
    log.debug("Vert lines in table = " + tableCells.size() + ", Text items = " + textItems.size());

    // Convert vertical lines to rectangles, one for each cell
    // Requires that the tableCells and textItems be ordered top to bottom and left to right
    Iterator<TableCell> it = tableCells.iterator();
    // Starting at the page width makes the first line's computed width negative,
    // so the first line is always treated as the left-hand edge of a row
    float prevCellEnd = pageWidth;
    float prevCellY = 0;
    ArrayList<String> row = null;
    while (it.hasNext()) {
      TableCell c = it.next();
      // Distance from the previous vertical line to this one
      c.width = c.x - prevCellEnd;
      prevCellEnd = c.x;
      if (c.width < 0) {
        // c.width < 0 means that the new vertical line is to the left of the previous one.
        // Because lines are sorted top-to-bottom, left-to-right, this means that it is the beginning of a new row
        // This vertical line is the lefthand end, so it doesn't represent a cell
        continue;
      }
      // This vertical line is the right-hand edge of a cell... calculate the corresponding left-hand edge
      // Height is the height of the vertical line, which should be the same height as the previous line
      c.x -= c.width;
      log.trace("Cell " + c.x + ", " + c.y + ", " + c.width + ", " + c.height);

      // Now extract the text relating to this cell from the sorted textItems set
      float currentY = -1000;
      float nextX = 0;
      // Boundary markers for the subset: top-left and bottom-right corners of the cell
      MyTextPosition start = new MyTextPosition(c.x, c.y);
      MyTextPosition end = new MyTextPosition(c.x + c.width, c.y + c.height);
      SortedSet<MyTextPosition> cellText = textItems.subSet(start, end);
      log.debug("Cell text size: " + cellText.size());

      // The text will be in the right order, left-to-right, top-to-bottom, but will also include text for
      // other cells on the same line... probably there's a more efficient way to deal with this than throwing away
      // all the ones outside this cell, but that's what happens here
      String text = "";
      for (MyTextPosition tp : cellText) {
        float x = tp.tp.getX();
        // Characters will leak through if they are on the same line, even if their X coordinates are out of range,
        // because textItems is first sorted on Y coordinates
        if (x > c.x && x < c.x + c.width) {
          if (!text.isEmpty()) {
            if (tp.tp.getY() - currentY > tp.tp.getHeight() / 4) {
              // Significant change in Y coordinate is interpreted as word-wrap
              text += '\n';
              // Inserting spaces based on X coordinates is extremely uncertain, so it isn't done!
              // Depending on kerning and other factors, characters may be closer than tp.getWidthOfSpace(), yet
              // visually will appear as a space.
//            } else if (x > nextX + tp.getWidthOfSpace()) {
//              c.text += ' ';
            } else if (x < nextX) {
              // Bold characters are represented by printing the same character twice, slightly offset
              continue;
            }
          }
          // Replace nonbreaking space character with a space, for convenience of pattern matching and other
          // text processing
          text += tp.tp.getCharacter().replaceAll("\\u00a0", " ");
          currentY = tp.tp.getY();
          // A following character that starts before the middle of this one is treated as a duplicate (see above)
          nextX = x + tp.tp.getWidth() / 2;
          log.trace("Cell text: '" + text.replaceAll("\\n", "\\\\n") + "'");
        }
      }

      // Finally, put the text into the appropriate cell in the output table
      log.debug("Cell " + c.y + ", " + prevCellY + ", " + text.replaceAll("\\n", "\\\\n"));
      if (c.y > prevCellY + 1) {
        // New row of cells found: this cell starts more than one unit below the current row
        prevCellY = c.y;
        row = new ArrayList<>(0);
        table.add(row);
      }
      assert row != null;
      row.add(text);
    }
  }

  /**
   * Collects a piece of text into the sorted text set instead of drawing it.
   *
   * Text lying entirely outside the page margins (when margins are set) is dropped.
   *
   * @param text The text to process
   */
  @Override
  protected void processTextPosition(TextPosition text) {
    // text.getY() is used because it takes page rotation into account and already has its origin at the
    // upper left, matching the coordinate system of the graphic lines
    final float top = text.getY();
    if (pageMargins != null) {
      if (top > pageMargins[1] || top + text.getHeight() < pageMargins[0]) {
        return;
      }
    }
    final MyTextPosition item = new MyTextPosition(text);
    textItems.add(item);
  }

  /**
   * Collects the current path into the sorted line/box sets instead of filling it.
   *
   * The path is split into rectangles only when its bounding box overlaps the page margins, or when no margins
   * are set.
   *
   * NOTE(review): unlike the PageDrawer method it overrides, this does not appear to reset the current line path
   * after use; confirm whether the path is cleared elsewhere.
   *
   * @param windingRule The winding rule this path will use (not used here).
   *
   */
  @Override
  public void fillPath(int windingRule) {
    final GeneralPath outline = getLinePath();
    final Rectangle2D bounds = outline.getBounds2D();
    if (pageMargins != null && !(bounds.getMaxY() > pageMargins[0] && bounds.getMinY() < pageMargins[1])) {
      return;
    }
    log.trace("fillPath " + bounds.getMinY() + ", " + bounds.getMaxY());
    split(outline);
  }

  /**
   * Collects the current path into the sorted line/box sets instead of stroking it.
   *
   * The path is split into rectangles only when its bounding box overlaps the page margins, or when no margins
   * are set.
   *
   * NOTE(review): unlike the PageDrawer method it overrides, this does not appear to reset the current line path
   * after use; confirm whether the path is cleared elsewhere.
   *
   */
  @Override
  public void strokePath() {
    final GeneralPath outline = getLinePath();
    final Rectangle2D bounds = outline.getBounds2D();
    if (pageMargins != null && !(bounds.getMaxY() > pageMargins[0] && bounds.getMinY() < pageMargins[1])) {
      return;
    }
    log.trace("strokePath " + bounds.getMinY() + ", " + bounds.getMaxY());
    split(outline);
  }

  /**
   * Collects the current clipping path into the sorted line/box sets instead of shading it. Called by the SHFill
   * operator.
   *
   * The clipping path is split into rectangles unless margins are set and its bounding box lies entirely above the
   * top margin or entirely below the bottom margin.
   *
   * @param shadingName The name of the Shading Dictionary to use for this fill instruction (only logged).
   *
   */
  @Override
  public void shFill(COSName shadingName) {
    log.trace("Shading = " + shadingName.toString());
    final GeneralPath clip = new GeneralPath(getGraphicsState().getCurrentClippingPath());
    final Rectangle2D bounds = clip.getBounds2D();
    final boolean outsideMargins = pageMargins != null
            && (bounds.getMaxY() < pageMargins[0] || bounds.getMinY() > pageMargins[1]);
    if (!outsideMargins) {
      log.debug("shFill " + bounds.getMinY() + ", " + bounds.getMaxY());
      split(clip);
    }
  }

  /**
   * Files a rectangle into the appropriate sorted set (horizLines, vertLines or boxes).
   *
   * Rectangles lying entirely outside the page margins (when margins are set) are dropped. Of the rest, anything
   * less than 2 units high counts as a horizontal line, anything else less than 2 units wide as a vertical line,
   * and everything else as a box.
   *
   * @param cell the rectangle to classify and store
   */
  private void addCell(TableCell cell) {
    log.trace(cell.toString());
    if (pageMargins != null) {
      if (cell.y > pageMargins[1] || cell.y + cell.height < pageMargins[0]) {
        return;
      }
    }
    final TreeSet<TableCell> destination = cell.height < 2
            ? horizLines
            : (cell.width < 2 ? vertLines : boxes);
    destination.add(cell);
    log.trace("cell added " + cell.toString());
  }

  /**
   * Split the path into several separate rectangles, identified by SEG_MOVETO operations.
   *
   * A single 'path' may include multiple objects. The beginning of a new object is identified by a "move to" command.
   * Each object is assumed to be a rectangle, so the path itself is discarded and just the boundary rectangle is
   * retained. Each finished rectangle is handed to addCell(), which files it as a horizontal line, vertical line
   * or box.
   *
   * NOTE(review): only coords[0] and coords[1] are read for every segment type. For SEG_QUADTO and SEG_CUBICTO
   * java.awt.geom.PathIterator returns the control point(s) first, so for curves the rectangle is grown to a
   * control point rather than to the curve's end point -- confirm this is acceptable.
   *
   * @param path The source path to split
   */
  protected void split(GeneralPath path) {
    PathIterator pathIt = path.getPathIterator(null);
    if (pathIt.isDone()) {
      // Empty path: nothing to collect
      return;
    }
    log.trace("split " + path.getBounds2D());
    float[] coords = new float[6];
    // Starting point of the current sub-path, reused when the sub-path is closed
    Point.Float openPoint = null;
    // Bounding rectangle of the sub-path currently being accumulated
    TableCell item = null;
    while (!pathIt.isDone()) {
      int type = pathIt.currentSegment(coords);
      if (rotation == 90) {
        // Swap the axes for a page rotated by 90 degrees.
        // NOTE(review): the meaning of the constant 250 is not evident from this file -- confirm
        float temp = coords[0];
        coords[0] = pageWidth - coords[1] - 250;
        coords[1] = temp;
      }
      Point.Float newPoint = new Point.Float(coords[0], coords[1]);
      log.trace("" + type + " " + (type == PathIterator.SEG_CLOSE ? "" : newPoint));
      switch (type) {
        case PathIterator.SEG_MOVETO: {
          // A "move to" starts a new object: store the one accumulated so far
          if (item != null && !item.isEmpty()) {
            addCell(item);
            item = null;
          }
          openPoint = newPoint;  // Save for next SEG_CLOSE
          break;
        }
        // Treat curved lines as straight lines... they shouldn't be here anyway
        case PathIterator.SEG_CUBICTO:
        case PathIterator.SEG_QUADTO:
        case PathIterator.SEG_LINETO: {
          break;
        }
        case PathIterator.SEG_CLOSE: {
          // Closing returns to the start of the sub-path; the coords array carries no point for this segment
          newPoint = openPoint;
          break;
        }
      }
      if (item == null) {
        assert newPoint != null;
        // Create a new rectangle to start the new shape
        item = new TableCell(newPoint.x, newPoint.y);
      } else {
        // Expand the boundary of this rectangle to include the new point
        item.add(newPoint);
      }
      pathIt.next();
    }
    // Store the last object, which has no following "move to" to flush it
    if (item != null && !item.isEmpty()) {
      addCell(item);
    }
    log.trace("results = " + vertLines.size());
  }

  /**
   * Overrides the image drawing method to ignore images: the body is intentionally empty.
   *
   * If images are wanted, this could be overridden to collect them into a sorted set.
   *
   * @param awtImage The image to draw (ignored).
   * @param at The transformation to use when drawing (ignored).
   *
   */
  @Override
  public void drawImage(Image awtImage, AffineTransform at) {
  }

  /**
   * Rectangle that sorts by its position on the page: by top edge (top to bottom), then by left edge, then by
   * height, then by width. The last two keys are irrelevant for the table stripper and exist only so that distinct
   * rectangles never compare as equal.
   *
   * A new cell starts as a 1 x 1 rectangle at the given point and is grown by adding further points to it.
   *
   */
  private class TableCell extends Rectangle2D.Float implements Comparable<TableCell> {

    private static final long serialVersionUID = 1L;

    TableCell(float x, float y) {
      super(x, y, 1, 1);
    }

    @Override
    public int compareTo(TableCell other) {
      if (other == this) {
        return 0;
      }
      // Sort keys in priority order; the first pair that differs decides the result
      final float[] mine = {y, x, height};
      final float[] theirs = {other.y, other.x, other.height};
      for (int k = 0; k < mine.length; k++) {
        if (mine[k] != theirs[k]) {
          return java.lang.Float.compare(mine[k], theirs[k]);
        }
      }
      return java.lang.Float.compare(width, other.width);
    }

    @Override
    public String toString() {
      final StringBuilder sb = new StringBuilder();
      sb.append(x).append(", ").append(y).append(", ").append(width).append(", ").append(height);
      return sb.toString();
    }
  }

  /**
   * Wrapper to sort TextPositions according to their location on the page. They are sorted according to their Y
   * coordinate, from top to bottom. Where two items have the same Y coordinate, they are sorted according to their
   * X coordinate. This doesn't work well for superscripts (sorted ahead of normal text on the same line) and subscripts
   * (sorted after normal text). Quantizing Y coordinates into text lines would solve that.
   *
   * Extending TextPosition makes it complex to get to the x & y coordinates, so just embed TextPosition in this class.
   *
   * An instance built from bare coordinates carries no TextPosition ({@code tp} is null); such instances serve only
   * as boundary markers for subset extraction.
   *
   */
  private class MyTextPosition implements Comparable<MyTextPosition> {

    /** The wrapped text, or null for a boundary marker. */
    final TextPosition tp;
    final private float x, y;

    /**
     * Wraps a piece of text, taking the sort coordinates from it.
     */
    MyTextPosition(TextPosition tp) {
      this.tp = tp;
      y = tp.getY();
      x = tp.getX();
    }

    /**
     * Dummy constructor for setting the limits of a subset extraction
     */
    MyTextPosition(float x, float y) {
      this.x = x;
      this.y = y;
      this.tp = null;
    }

    @Override
    @SuppressWarnings("AccessingNonPublicFieldOfAnotherObject")
    public int compareTo(MyTextPosition other) {
      if (other.y != y) {
        return Float.compare(y, other.y);
      }
      return Float.compare(x, other.x);
    }

    @Override
    public String toString() {
      // Boundary markers have no text; print them with an empty character instead of failing on the null tp
      final String character = tp == null ? "" : tp.getCharacter();
      return (character + " " + ", (" + x + ", " + y + ")");
    }
  }
}

package input.pdf;

import airspace.Zone;
import java.io.IOException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

/**
 * Extracts table rows from the first page of a PDF document. Each character is assigned to a column according to the
 * X-coordinate limits of the current {@link PDFTable} layout, and the characters of one row are collected into an
 * array of column Strings.
 *
 * A row is considered complete either when the last column matches the table's 'lineEnd' pattern, or when a character
 * appears to the left of 'returnColumnNum' while the 'required' column already holds data. When the characters
 * accumulated so far on a line equal the table's 'tableEnd' String, extraction moves on to the next table layout.
 *
 * Pages after the first are ignored.
 *
 * @author Frank van der Hulst <[email protected]>
 */
public class PdfPlateStripper extends PDFTextStripper {

  private final static Logger log = Logger.getLogger(PdfPlateStripper.class.getName());

  /** Column contents of the row currently being assembled. */
  protected String columns[];
  protected String prevBoundId = null;
  protected float currentLine;
  /** Characters with a Y coordinate greater than this are skipped. */
  protected float bottomOfTable;
  /** Characters with a Y coordinate less than this are skipped. */
  protected float topOfTable;
  protected Zone prevZone = null;
  protected final int[] pgTops;
  protected final int[] pgBottoms;
  /** Layout of the table currently being extracted. */
  protected PDFTable currentTable;
  /** Rows collected so far for the current table. */
  protected ArrayList<String[]> result;
  /** One row list per table, indexed by table number. */
  private final List<String[]>[] results;
  private final PDFTable[] tables;
  private int tableNum;

  /**
   * Create a stripper for the given table layouts.
   *
   * @param tops Per-page upper Y limit; characters above it are skipped. The last value is used for any page beyond
   * the end of the array.
   * @param bottoms Per-page lower Y limit; characters below it are skipped. The last value is used for any page beyond
   * the end of the array.
   * @param tabs Table layouts, in the order the tables appear; must contain at least one entry
   * @throws IOException
   */
  @SuppressWarnings({"unchecked"})
  public PdfPlateStripper(int[] tops, int[] bottoms, PDFTable[] tabs) throws IOException {
    super();
    setLineSeparator("\n");
    setPageEnd("\f");
    this.setSortByPosition(true);

    pgTops = tops;
    pgBottoms = bottoms;
    tables = tabs;

    tableNum = 0;
    result = new ArrayList<>(0);
    results = (List<String[]>[]) Array.newInstance(result.getClass(), tables.length);
    currentTable = tables[tableNum];
    results[tableNum] = result;
    startNewRow();
  }

  /**
   * Begin a fresh row: one empty String per column of the current table.
   */
  private void startNewRow() {
    columns = new String[currentTable.columnLimits.keySet().size() - 1];
    for (int i = 0; i < columns.length; i++) {
      columns[i] = "";
    }
  }

  /**
   * Finish the given row. The row is stored only if its 'required' column is not empty; a new empty row is started in
   * either case.
   *
   * @param columns Row to finish; may be null, in which case nothing is stored
   */
  private void endRow(String[] columns) {
    // First save any data... may be no data at start of page or end of table
    if (columns != null) {
      if (!columns[currentTable.requiredColumn].isEmpty()) {
        result.add(columns);
      }
      // Trace inside the null guard so that a null row cannot cause a NullPointerException here
      log.trace(result.size() + ": " + columns[0]);
    }
    startNewRow();
  }

  /**
   * Called once for each page in the PDF document. Only page 1 is processed, and only while there are still tables
   * left to extract.
   *
   * @throws IOException
   */
  @Override
  public void writePage() throws IOException {
    if (tableNum >= tables.length) {
      return;
    }
    int pageNo = getCurrentPageNo();
    if (pageNo > 1) {
      return;
    }
    float prevLine = -1000;
    String wholeLine = "";

    // Use this page's limits, or the last ones given if there are fewer entries than pages
    topOfTable = pgTops[(pageNo <= pgTops.length ? pageNo : pgTops.length) - 1];
    bottomOfTable = pgBottoms[(pageNo <= pgBottoms.length ? pageNo : pgBottoms.length) - 1];
    log.debug("Page " + pageNo + " bounds " + topOfTable + "-" + bottomOfTable);
    @SuppressWarnings("UseOfObsoleteCollectionType")
    final java.util.Vector<List<TextPosition>> pageText = getCharactersByArticle();
    for (List<TextPosition> l : pageText) {
      for (TextPosition tp : l) {
        // Replace unbreakable space with ordinary space for ease of pattern matching
        char ch = tp.getCharacter().charAt(0);
        float yPos = tp.getY();
        float xPos = tp.getX();
        if (ch == '\u00a0') {
          ch = ' ';
        }

        // Skip page header & footer (footer may be parsed before body of page)
        if (yPos < topOfTable || yPos > bottomOfTable) {
          continue;
        }

        // Check for end-of-table delimiter. This may be at any X position, so checking needs to be
        // separate from column handling
        if (yPos > prevLine) {
          // Y has increased, so this character starts a new line
          wholeLine = "" + ch;
          if (log.getLevel() == Level.TRACE) {
            System.out.print("\n" + ch);
          }
        } else {
          if (log.getLevel() == Level.TRACE) {
            System.out.print(ch);
          }
          wholeLine += ch;
          if (wholeLine.equals(currentTable.tableEnd)) {
            // End-of-table delimiter found... step to next table. Any other data on this line is ignored.
            endRow(columns);
            tableNum++;
            log.debug("Table " + tableNum + " delimiter at " + yPos);
            if (tableNum >= tables.length) {
              // No more tables: move the top limit below the bottom limit so that all remaining text is skipped
              topOfTable = bottomOfTable + 1;
              return;
            }
            currentTable = tables[tableNum];
            // Skip over the header of the next table
            topOfTable = yPos + currentTable.headerHeight;
            result = new ArrayList<>(0);
            results[tableNum] = result;
            startNewRow();
            prevLine = yPos;
            continue;
          }
        }
        prevLine = yPos;

        int columnNo = currentTable.columnLimits.floorEntry(xPos).getValue();
        assert (columnNo < columns.length) : "Position " + xPos + " > " + columnNo + " too big";
        // If the 'required' column is empty and this is to the right of the required column, then this is
        // actually part of the previous line
        if (columnNo > currentTable.requiredColumn
                && columns[currentTable.requiredColumn].isEmpty()
                && !result.isEmpty()) {
          // Add character to previous row
          result.get(result.size() - 1)[columnNo] += ch;
          continue;
        }

        if (currentTable.lineEnd != null && currentTable.lineEnd.matcher(columns[columns.length - 1]).matches()) {
          // Full row detected because the end-of-line pattern was detected in the last column.
          endRow(columns);
        } else if (xPos < currentTable.returnColumnNum && !columns[currentTable.requiredColumn].isEmpty()) {
          // Some tables don't have a detectable end-of-line pattern. In that case, the 'lineEnd' value is null,
          // and a full row is detected if the 'required' column is not empty, and this X position is left of it...
          // i.e. it is assumed that the cursor has moved back left of the 'required' column.
          endRow(columns);
        }
        // Add character to appropriate column field.
        columns[columnNo] += ch;
      }
    }
    endRow(columns);
  }

  /**
   * Parse the specified tables in the specified PDF file
   *
   * @param filepath Full path to file
   * @return Array of Lists -- one List per table. Each list contains one String[] entry for each row in the table. Each
   * row is an array of Strings, with one entry for each column in the table.
   * @throws IOException
   */
  @SuppressWarnings("ReturnOfCollectionOrArrayField")
  public List<String[]>[] parse(String filepath) throws IOException {
    // try-with-resources closes the document even when text extraction fails
    try (PDDocument doc = PDDocument.load(filepath)) {
      document = doc;
      getText(doc);
    }
    return results;
  }
}

package input.pdf;

import airspace.Zone;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.List;
import java.util.NavigableMap;
import java.util.TreeMap;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;

/**
 * Define how a table is laid out, and rules for identifying the end of a row.
 *
 * The column X-coordinate locations are either specified by the caller, or derived from the boxes found on the first
 * page of a PDF file.
 *
 * Text in a cell may be wrapped across several lines, so Y-coordinates are not used to identify when a row begins or
 * ends. The multiple lines of a wrapped cell are unwrapped and returned as a single String. To accomplish this, a
 * 'full' row is recognized either because the last column entry matches the 'lineEnd' Pattern, or because the
 * 'requiredColumn' column is not empty and the X-coordinate is now to the left of 'returnColumnNum'... this may be the
 * right-hand side of the first (or other) column, or the left-hand side of the 'requiredColumn' column.
 *
 * The end of the entire table is recognized by the fact that the 'tableEnd' String is present on a line by itself.
 *
 * Multiple tables with different layouts can be specified to be extracted consecutively from a single PDF file.
 *
 * A single table may span multiple pages, or there can be several tables vertically on one page.
 *
 * @author Frank van der Hulst <[email protected]>
 */
public class PDFTable {

  // Log under this class's own category (previously registered under Zone by mistake)
  protected final static Logger log = Logger.getLogger(PDFTable.class.getName());
  /** Identifier/descriptor of the table, for debugging/documentation. */
  public final String name;
  /** Maps the left-hand X coordinate of each column edge to its edge index (0 for the first edge). */
  final NavigableMap<Float, Integer> columnLimits;
  /** Height of the table's header, in PDF Y-coordinates. */
  final int headerHeight;
  /** Pattern marking a complete row when matched by the last column, or null if there is no such pattern. */
  final Pattern lineEnd;
  /** String that marks the end of the table. */
  final String tableEnd;
  /** X coordinate left of which a new character indicates that the previous row is full. */
  final float returnColumnNum;
  /** Index of the column that must be non-empty for a row to be considered full. */
  final int requiredColumn;

  /**
   * Construct an object to represent how a table is laid out in the PDF document
   *
   * @param tableName Identifier/descriptor of table, for debugging/documentation
   * @param returnX If the X-coordinate of the first item of a line is less than this, and the previous line's
   * 'required' column is not empty, it indicates that the previous line is full. Must be less than the left-hand edge
   * of the 'required' column. PDF X-coordinate.
   * @param reqCol Required column number. Only if this column is not empty will the line be considered to be full.
   * @param headerHeight Height of this table's headers in PDF Y-coordinates. This amount of space is skipped over at
   * the start of the table, and each page.
   * @param lineEnd End of line marker Pattern... set to null if the last column doesn't have a detectable pattern.
   * @param tableEnd End of table delimiter String... may not be null or empty. When this String is found on a line by
   * itself, the table is finished and the next table (if any) begun
   * @param columnLimits Limits of columns to extract... first entry should be 0, last entry should be max page width,
   * in PDF X-coordinates
   */
  public PDFTable(String tableName, int returnX, int reqCol, int headerHeight, Pattern lineEnd, String tableEnd, float... columnLimits) {
    this.name = tableName;
    this.columnLimits = new TreeMap<>();
    for (int i = 0; i < columnLimits.length; i++) {
      this.columnLimits.put(columnLimits[i], i);
    }
    this.returnColumnNum = returnX;
    this.requiredColumn = reqCol;
    this.headerHeight = headerHeight;
    this.lineEnd = lineEnd;
    this.tableEnd = tableEnd;
    assert tableEnd != null && !tableEnd.isEmpty();
    assert columnLimits.length > requiredColumn + 1 : "requiredColumn too big: " + requiredColumn;
    assert returnX <= columnLimits[requiredColumn] : "returnX too big: " + returnX + " vs " + columnLimits[requiredColumn];
  }

  /**
   * Construct a representation of how a table is laid out in the PDF document. The actual column limits are calculated
   * by locating the table heading on the first page of the file and extracting boxes from that.
   *
   * @param tableName Identifier/descriptor of table, for debugging/documentation
   * @param filepath Path of the PDF file whose first page is scanned for the table heading
   * @param reqCol Required column number. Only if this column is not empty will the line be considered to be full.
   * @param lineEnd End of line marker Pattern... set to null if the last column doesn't have a detectable pattern.
   * @param tableEnd End of table delimiter String
   * @param bounds Passed to the LinedTableStripper used for scanning; the first entry is also subtracted from the
   * bottom of the heading to obtain the header height
   * @param firstHdrLine String used to locate the first line of the heading
   * @param firstDataLine String used to locate the first line of data
   * @throws java.io.IOException
   */
  public PDFTable(String tableName, String filepath, int reqCol, Pattern lineEnd, String tableEnd, float[] bounds, String firstHdrLine, String firstDataLine) throws IOException {
    this.name = tableName;
    this.columnLimits = new TreeMap<>();
    this.lineEnd = lineEnd;
    this.tableEnd = tableEnd;
    log.debug("PDFTable(\"" + tableName + "\")");
    try (PDDocument document = PDDocument.load(filepath)) {
      LinedTableStripper stripper = new LinedTableStripper(bounds);
      log.info("Scanning first page of " + filepath);
      @SuppressWarnings("unchecked")
      List<PDPage> allPages = document.getDocumentCatalog().getAllPages();
      PDPage pg = allPages.get(0);
      stripper.drawPage(pg);
      // Get the Y coordinates of the top and bottom of the table
      float[] tableBounds = stripper.findTable(firstHdrLine, firstDataLine);
      assert (tableBounds[0] != Float.MAX_VALUE && tableBounds[1] != Float.MAX_VALUE) :
              "Table header missing: " + tableBounds[0] + "-" + tableBounds[1];
      log.debug("Table header found: " + tableBounds[0] + "-" + tableBounds[1]);
      log.trace("Boxes: " + stripper.boxes.size());
      this.requiredColumn = reqCol;
      int edgeCount = 0;
      // The first column always starts at the left-hand edge of the page
      this.columnLimits.put(0f, edgeCount++);
      float retColumnNum = -1;
      for (Object o : stripper.boxes) {
        Rectangle2D.Float cell = (Rectangle2D.Float) o;
        log.trace("Cell (" + cell.x + ", " + cell.y + ", " + cell.width + ", " + cell.height);
        // Only boxes lying entirely within the heading contribute column boundaries
        if (cell.y < tableBounds[0] || cell.y + cell.height > tableBounds[1]) {
          continue;
        }
        this.columnLimits.put(cell.x, edgeCount);
        if (edgeCount++ == requiredColumn) {
          // Left-hand edge of the 'required' column is the X position that marks a return to a new row
          retColumnNum = cell.x;
        }
        log.debug("Column boundary at " + cell.x);
      }
      this.returnColumnNum = retColumnNum;
      // Final limit: a fixed right-hand edge of 850
      this.columnLimits.put(850f, edgeCount++);
      this.headerHeight = (int) (tableBounds[1] - bounds[0]);
      log.debug("Table header height: " + headerHeight);
    }
  }
}

package input.pdf;

import airspace.Zone;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

/**
 * Extract data from specified tables in the PDF file.
 *
 * The layout of each table is defined by a PDFTable object.
 *
 * Text in a cell may be wrapped across several lines, so Y-coordinates are not used to identify when a row begins or
 * ends. The multiple lines of a wrapped cell are unwrapped and returned as a single String. To accomplish this, a
 * 'full' row is recognized either because the last column entry matches the 'lineEnd' Pattern, or because the
 * 'requiredColumn' column is not empty and the X-coordinate is now to the left of 'returnColumnNum'... this may be the
 * right-hand side of the first (or other) column, or the left-hand side of the 'requiredColumn' column.
 *
 * The end of the entire table is recognized by the 'tableEnd' String on a line by itself.
 *
 * Multiple tables with different layouts can be specified to be extracted consecutively from a single PDF file.
 *
 * A single table may span multiple pages, or there can be several tables vertically on one page.
 *
 * This is by no means a complete solution to extracting data tables from PDF files.
 *
 * @see PDFTable
 *
 * @author Frank van der Hulst <[email protected]>
 */
public class PDFTableStripper extends PDFTextStripper {

  // Log under this class's own category (previously registered under Zone by mistake)
  private final static Logger log = Logger.getLogger(PDFTableStripper.class.getName());
  /** Maximum number of columns included in the debug message for a completed row. */
  private final static int MAX_LOGGED_COLUMNS = 5;
  private final PDFTable[] tables;
  private final int[] pgTops;
  private final int[] pgBottoms;
  /** Column contents of the row currently being assembled. */
  private String columns[];
  /** Characters with a Y coordinate greater than this are skipped. */
  private float bottomMargin;
  /** Characters with a Y coordinate less than this are skipped. */
  private float topMargin;
  private int tableNum;
  private PDFTable currentTable;
  /** Rows collected so far for the current table. */
  private ArrayList<String[]> result;
  /** One row list per table, indexed by table number. */
  private final List<String[]>[] results;

  /**
   *
   * @param tops Height of header of each page. The last value given is used for any subsequent pages.
   * @param bottoms Height of footer of each page. The last value given is used for any subsequent pages.
   * @param tabs Table layouts
   * @throws IOException
   */
  @SuppressWarnings({"unchecked"})
  public PDFTableStripper(int[] tops, int[] bottoms, PDFTable... tabs) throws IOException {
    super();
    setLineSeparator("\n");
    setPageEnd("\f");
//    setSortByPosition(true);  // setSortByPosition DOES NOT WORK!

    pgTops = tops;
    pgBottoms = bottoms;
    tables = tabs;
    tableNum = 0;
    result = new ArrayList<>(0);
    results = (List<String[]>[]) Array.newInstance(result.getClass(), tables.length);
    currentTable = tables[tableNum];
    results[tableNum] = result;
    startNewRow();
  }

  /**
   * Create a stripper for a single table, with the page limits fixed at 0 (top) and 850 (bottom).
   *
   * @param tab Table layout
   * @throws IOException
   */
  @SuppressWarnings("unchecked")
  public PDFTableStripper(PDFTable tab) throws IOException {
    super();
    setLineSeparator("\n");
    setPageEnd("\f");
    this.setSortByPosition(true);
    final int[][] maxSize = {{0}, {850}};
    pgTops = maxSize[0];
    pgBottoms = maxSize[1];
    topMargin = 0;
    bottomMargin = 850;

    tableNum = 0;
    result = new ArrayList<>(0);
    results = (List<String[]>[]) Array.newInstance(result.getClass(), 1);
    results[0] = result;
    currentTable = tab;
    // Keep the single layout in 'tables' so that writePage() and the end-of-table handling, which both
    // consult tables.length, do not fail with a NullPointerException.
    tables = new PDFTable[]{tab};
    startNewRow();
  }

  /**
   * Begin a fresh row: one empty String per column of the current table.
   */
  private void startNewRow() {
    columns = new String[currentTable.columnLimits.keySet().size() - 1];
    for (int i = 0; i < columns.length; i++) {
      columns[i] = "";
    }
  }

  /**
   * Finish the given row. The row is stored (and logged) only if its 'required' column is not empty; a new empty row
   * is started in either case.
   *
   * @param columns Row to finish; may be null, in which case nothing is stored
   */
  private void endRow(String[] columns) {
    // First save any data... may be no data at start of page or end of table, in which case 'columns' will be null
    if (columns != null && !columns[currentTable.requiredColumn].isEmpty()) {
      result.add(columns);
      if (log.isDebugEnabled()) {
        // Log the leading columns only, without assuming that the table has at least three of them
        final StringBuilder summary = new StringBuilder();
        for (int i = 0; i < columns.length && i < MAX_LOGGED_COLUMNS; i++) {
          if (i > 0) {
            summary.append(", ");
          }
          summary.append(columns[i]);
        }
        log.debug(summary.toString());
      }
    }
    startNewRow();
  }

  /** Y coordinate of the most recently handled character; used to detect the start of a new line. */
  private float prevLine = -1000;
  /** Characters accumulated on the current line, for comparison with the end-of-table delimiter. */
  private String wholeLine = "";

  /**
   * Handle a single character: skip it if it lies outside the page limits, check for the end-of-table delimiter, and
   * otherwise append it to the appropriate column of the current (or previous) row.
   *
   * @param xPos X coordinate of the character
   * @param yPos Y coordinate of the character
   * @param ch The character
   */
  public void processTextPosition(float xPos, float yPos, char ch) {
    if (log.getLevel() == Level.TRACE) {
      System.out.print(ch);
    }

    // Skip page header & footer (footer may be parsed before body of page)
    if (yPos < topMargin || yPos > bottomMargin) {
      return;
    }

    if (ch == '\u00a0') {
    // Replace unbreakable space with ordinary space for ease of pattern matching
      ch = ' ';
    } else if (ch == '\u00ad') {
    // Replace soft hyphen (U+00AD) with ordinary minus for ease of pattern matching
      ch = '-';
    }

    // Check for end-of-table delimiter. This may be at any X position, so checking needs to be
    // separate from column handling
    if (yPos > prevLine) {
      // Y has increased, so this character starts a new line
      wholeLine = "" + ch;
    } else {
      wholeLine += ch;
      if (wholeLine.equals(currentTable.tableEnd)) {
        // End-of-table delimiter found... step to next table. Any other data on this line is ignored.
        endRow(columns);
        tableNum++;
        log.debug("Table " + tableNum + " delimiter at " + yPos);
        if (tableNum >= tables.length) {
          // No more tables: move the top limit below the bottom limit so that all remaining text is skipped
          topMargin = bottomMargin + 1;
          return;
        }
        currentTable = tables[tableNum];
        // Skip over the header of the next table
        topMargin = yPos + currentTable.headerHeight;
        result = new ArrayList<>(0);
        results[tableNum] = result;
        startNewRow();
        prevLine = yPos;
        return;
      }
    }
    prevLine = yPos;

    int columnNo = currentTable.columnLimits.floorEntry(xPos).getValue();
    assert (columnNo < columns.length) : "Position " + xPos + " > " + columnNo + " too big";
    // If the 'required' column is empty and this is to the right of the required column, then this is
    // actually part of the previous line
    if (columnNo > currentTable.requiredColumn
            && columns[currentTable.requiredColumn].isEmpty()
            && !result.isEmpty()) {
      // Add character to previous row
      result.get(result.size() - 1)[columnNo] += ch;
      return;
    }

    if (currentTable.lineEnd != null && currentTable.lineEnd.matcher(columns[columns.length - 1]).matches()) {
      // Full row detected because the end-of-line pattern was detected in the last column.
      endRow(columns);
    } else if (xPos < currentTable.returnColumnNum && !columns[currentTable.requiredColumn].isEmpty()) {
      // Some tables don't have a detectable end-of-line pattern. In that case, the 'lineEnd' value is null,
      // and a full row is detected if the 'required' column is not empty, and this X position is left of it...
      // i.e. it is assumed that the cursor has moved back left of the 'required' column.
      endRow(columns);
    }
    // Add character to appropriate column field.
    columns[columnNo] += ch;
  }

  /**
   * Called once for each page in the PDF document.
   *
   * @throws IOException
   */
  @Override
  public void writePage() throws IOException {
    if (tableNum >= tables.length) {
      return;
    }
    int pageNo = getCurrentPageNo();

    // Use this page's limits, or the last ones given if there are fewer entries than pages
    topMargin = pgTops[(pageNo <= pgTops.length ? pageNo : pgTops.length) - 1];
    bottomMargin = pgBottoms[(pageNo <= pgBottoms.length ? pageNo : pgBottoms.length) - 1];
    log.debug("Page " + pageNo + " bounds " + topMargin + "-" + bottomMargin);
    wholeLine = "";
    @SuppressWarnings("UseOfObsoleteCollectionType")
    final java.util.Vector<List<TextPosition>> pageText = getCharactersByArticle();
    for (List<TextPosition> l : pageText) {
      for (TextPosition tp : l) {
        processTextPosition(tp.getX(), tp.getY(), tp.getCharacter().charAt(0));
      }
    }
    endRow(columns);
  }

  /**
   * Convenience method to parse the specified tables in the specified PDF file.
   *
   * @param file PDF file to read
   * @return Array of Lists -- one List per table. Each list contains one String[] entry for each row in the table. Each
   * row is an array of Strings, with one entry for each column in the table.
   * @throws IOException
   */
  @SuppressWarnings("ReturnOfCollectionOrArrayField")
  public List<String[]>[] parse(File file) throws IOException {
    // try-with-resources closes the document even when text extraction fails
    try (PDDocument doc = PDDocument.load(file)) {
      document = doc;
      getText(doc);
    }
    return results;
  }
}

package input.pdf;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

/**
 * Extracts the text of a PDF document as lines, page by page. Characters outside the top and bottom margins are
 * ignored, and pages after the last requested page are skipped.
 *
 * @author <a href="mailto:[email protected]">Frank van der Hulst</a>
 * @version $Revision: 1.00 $
 */
public class TextStripper extends PDFTextStripper {

  // Log under this class's own category (previously registered under PDFTableStripper by mistake)
  private final static Logger log = Logger.getLogger(TextStripper.class.getName());

  /** Characters with a Y coordinate less than this are skipped. */
  protected final int topMargin;
  /** Characters with a Y coordinate greater than this are skipped. */
  protected final int bottomMargin;
  /** Pages with a number greater than this are skipped. */
  protected final int lastPage;
  /** One list of text lines per processed page. */
  private final ArrayList<ArrayList<String>> results = new ArrayList<>(10);

  /**
   * Create a stripper for the given document.
   *
   * @param document Document to extract text from; it is not closed by this class
   * @param top Characters with a Y coordinate less than this are skipped
   * @param bottom Characters with a Y coordinate greater than this are skipped
   * @param numPages Number of the last page to process
   * @throws IOException
   */
  public TextStripper(PDDocument document, int top, int bottom, int numPages) throws IOException {
    super();
    this.document = document;
    this.topMargin = top;
    this.bottomMargin = bottom;
    this.lastPage = numPages;
    setSortByPosition(true);
  }

  /**
   * Called once for each page in the PDF document. Collects the characters between the margins into lines and stores
   * them as this page's entry in the results.
   *
   * @throws IOException
   */
  @Override
  public void writePage() throws IOException {
    int pageNo = getCurrentPageNo();
    log.debug("Page " + pageNo + " bounds " + topMargin + "-" + bottomMargin);
    if (pageNo > lastPage) {
      return;
    }
    ArrayList<String> result = new ArrayList<>(60);
    results.add(result);
    float currentY = -1000;
    float nextX = 0;
    @SuppressWarnings("UseOfObsoleteCollectionType")
    final java.util.Vector<List<TextPosition>> pageText = getCharactersByArticle();
    final StringBuilder line = new StringBuilder();
    for (List<TextPosition> l : pageText) {
      for (TextPosition tp : l) {
        if (tp.getY() < topMargin || tp.getY() > bottomMargin) {
          continue;
        }
        if (Math.abs(tp.getY() - currentY) > tp.getHeight()/4) {
          // Y moved by more than a quarter of the character height: flush the current line and start a new one
          if (line.length() > 0) {
            final String finished = line.toString();
            if (log.getLevel() == Level.TRACE) {
              System.out.println(finished);
            }
            result.add(finished);
          }
          line.setLength(0);
          line.append(tp.getCharacter());
          currentY = tp.getY();
        } else {
          float x = tp.getX();
          // A horizontal gap wider than a quarter of a space width is rendered as a single space
          if (x > nextX + tp.getWidthOfSpace()/4) {
            line.append(' ');
          }
          line.append(tp.getCharacter());
        }
        // Where the next character would start if it followed this one directly
        nextX = tp.getX() + tp.getWidth();
      }
    }
    if (line.length() > 0) {
      result.add(line.toString());
    }
  }

  /**
   * Extract the text of the document given to the constructor.
   *
   * @return List of Lists -- one List per processed page. Each list contains one String for each line of text found
   * between the margins on that page.
   * @throws IOException
   */
  @SuppressWarnings("ReturnOfCollectionOrArrayField")
  public ArrayList<ArrayList<String>> parse() throws IOException {
    getText(document);
    return results;
  }
}

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: PDF extraction

Reply via email to