ok - I've done a thorough evaluation of what would be required to properly suppress overlapping spaces (but leave in the non-overlapping spaces), and a lot of pathfinding.
What is really needed is to compare two non-space positions and see if there *should* be a space between them, and if not, then suppress any spaces that appear between them. I know how to do this, but it would require a good bit of surgery. Basically, we would need to change to accumulating text positions by line (without processing them). Then once a line is accumulated, run through the line and insert word separators (and remove any overlapping spaces). I would probably do this by moving the word separation logic into the normalize(line) method. But that is a lot of surgery. My concern is that determining if two non-space glyphs are actually adjacent requires making a determination based on the width of a space glyph - and that introduces the risk of suppressing an overlapping space when it shouldn't. I'm not comfortable with that sort of regression - especially for something that is not adding a ton of value beyond what we can achieve by just removing Tj spaces entirely. If someone really cares about preserving the physical layout of the page, I believe it would be better to put time into making normalize() add space characters (or tabs maybe) based on the physical layout of the non-space characters, instead of attempting to preserve any spaces that are part of the content stream operations. So for now, I am going to suggest that we implement the setting to ignore space glyphs in the content stream (as suggested in https://issues.apache.org/jira/browse/PDFBOX-3774 ). I'm including a patch that does this below (one source change, plus a self-contained unit test). I wish this patch was more valuable! But if it helps anything, this change has a dramatic effect on the robustness of our application, so there is a ton of value to us to having PDFBox add this setting in 3.0.4. I'm happy to discuss the above further - I'm just not seeing a huge amount of value compared to the blanket removal of spaces... Index: pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java =================================================================== --- pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java (revision 1922522) +++ pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java (working copy) @@ -40,6 +40,7 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -147,6 +148,7 @@ private boolean shouldSeparateByBeads = true; private boolean sortByPosition = false; private boolean addMoreFormatting = false; + private boolean ignoreContentStreamSpaceGlyphs = false; private float indentThreshold = defaultIndentThreshold; private float dropThreshold = defaultDropThreshold; @@ -524,11 +526,10 @@ { IterativeMergeSort.sort(textList, comparator); } - finally - { - // PDFBOX-5487: Remove all space characters if contained within the adjacent letters - removeContainedSpaces(textList); - } + + // PDFBOX-5487: Remove all space characters if contained within the adjacent letters + removeContainedSpaces(textList); + } startArticle(); @@ -556,6 +557,10 @@ PositionWrapper current = new PositionWrapper(position); String characterValue = position.getUnicode(); + // PDFBOX-3774 - conditionally ignore spaces from the content stream + if (" ".equals(characterValue) && getIgnoreContentStreamSpaceGlyphs()) + continue; + // Resets the average character width when we see a change in font // or a change in the font size if (lastPosition != null && @@ -1273,6 +1278,30 @@ sortByPosition = newSortByPosition; } + + /** + * This setting only applies if sortByPosition has been set to true. + * + * Determines whether spaces in the content stream text rendering instructions will be ignored during text extraction. + * + * @return whether space glyphs in the content stream text rendering instructions will be ignored during text extraction - default is false + */ + public boolean getIgnoreContentStreamSpaceGlyphs() { + return ignoreContentStreamSpaceGlyphs; + } + + /** + * This setting only applies if sortByPosition has been set to true. + * + * Instruct the algorithm to ignore any spaces in the text rendering instructions in the content stream, and instead rely purely on the algorithm to determine where word breaks are. + * This can improve text extraction results where the content stream has text overlapping spaces, but could cause some word breaks to not be added to the output + * + * @param newIgnoreRenderedSpaces whether PDF Box should ignore context stream spaces + */ + public void setIgnoreContentStreamSpaceGlyphs(boolean newIgnoreContentStreamSpaceGlyphs) { + ignoreContentStreamSpaceGlyphs = newIgnoreContentStreamSpaceGlyphs; + } + /** * Get the current space width-based tolerance value that is being used to estimate where spaces in text should be * added. Note that the default value for this has been determined from trial and error. Index: pdfbox/src/test/java/org/apache/pdfbox/text/PDFTextStripperOverlapTest.java =================================================================== --- pdfbox/src/test/java/org/apache/pdfbox/text/PDFTextStripperOverlapTest.java (revision 0) +++ pdfbox/src/test/java/org/apache/pdfbox/text/PDFTextStripperOverlapTest.java (revision 0) @@ -0,0 +1,58 @@ +package org.apache.pdfbox.text; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts.FontName; +import org.junit.jupiter.api.Test; + +public class PDFTextStripperOverlapTest { + + @Test + void testIgnoreContentStreamSpaceGlyphs() throws Exception + { + try (PDDocument doc = new PDDocument()) + { + PDPage page = new PDPage(); + try (PDPageContentStream cs = new PDPageContentStream(doc, page)) + { + float fontHeight = 8; + float x = 50; + float y = page.getMediaBox().getHeight() - 50; + PDFont font = new PDType1Font(FontName.HELVETICA); + cs.beginText(); + cs.setFont(font, fontHeight); + cs.newLineAtOffset(x, y); + cs.showText("( )"); + cs.endText(); + + int indent = 6; + float overlapX = x + indent * font.getAverageFontWidth()/1000f*fontHeight; + PDFont overlapFont = new PDType1Font(FontName.TIMES_ROMAN); + cs.beginText(); + cs.setFont(overlapFont, fontHeight*2f); + cs.newLineAtOffset(overlapX, y); + cs.showText("overlap"); + cs.endText(); + } + doc.addPage(page); + + PDFTextStripper stripper = new PDFTextStripper(); + stripper.setLineSeparator("\n"); + stripper.setPageEnd("\n"); + stripper.setStartPage(1); + stripper.setEndPage(1); + stripper.setSortByPosition(true); + + stripper.setIgnoreContentStreamSpaceGlyphs(true); + String text = stripper.getText(doc); + assertEquals("( overlap )\n", text); + + } + } + +}