[
https://issues.apache.org/jira/browse/PDFBOX-5545?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Owen McGovern updated PDFBOX-5545:
----------------------------------
Description:
I process a lot of medical-related PDF files with a lot of superscripts,
subscripts, out-of-order characters, etc.
We tend to have trouble with the sortByPosition flag in PDFTextStripper.
If it's not enabled, we end up with characters which are out of order in some
PDFs.
If we do enable it, it sometimes messes up superscript and subscript positions.
Could you expose a setter for the comparator instance, so that I can try to
correct it? E.g.
{code:java}
// Comparator held by this object; initialised to a TextPositionComparator.
private Comparator<TextPosition> textPositionComparator = new
TextPositionComparator();
/**
* Replaces the comparator stored in {@code textPositionComparator}.
*
* @param newTextPositionComparator the comparator to store; it is assigned
* as-is, without any null check
*/
public void setTextPositionComparator(final Comparator<TextPosition>
newTextPositionComparator) {
this.textPositionComparator = newTextPositionComparator;
}
{code}
Then, in the writePage() method, just use that comparator?
I want to try to implement a comparator that fixes this, e.g. something like
this (in Kotlin):
{code:java}
import org.apache.pdfbox.text.TextPosition
import kotlin.math.abs
/**
 * Orders [TextPosition]s while tolerating superscripts and subscripts.
 *
 * Positions are compared first by text direction. Within one direction, two
 * positions are treated as lying on the same line, and are ordered by their
 * direction-adjusted X, when either
 *  - their bottoms differ by less than a fraction of the taller glyph's height, or
 *  - the bottom of one falls inside the vertical extent of the other.
 *
 * Otherwise the position with the smaller direction-adjusted Y sorts first.
 */
class TextPositionSubscriptComparator : Comparator<TextPosition> {

    /**
     * @param pos1 the first position to compare
     * @param pos2 the second position to compare
     * @return a negative value, zero, or a positive value when [pos1] sorts
     * before, together with, or after [pos2]
     */
    override fun compare(pos1: TextPosition, pos2: TextPosition): Int {
        val textDir = pos1.dir.compareTo(pos2.dir)
        if (textDir != 0) {
            return textDir
        }

        val pos1YBottom = pos1.yDirAdj
        val pos2YBottom = pos2.yDirAdj
        val pos1YTop = pos1YBottom - pos1.heightDir
        val pos2YTop = pos2YBottom - pos2.heightDir

        // Superscript / subscript tolerance by ratio of the (taller) character height.
        val tolerance = maxOf(pos1.heightDir, pos2.heightDir) * INV_SIZE_RATIO_DIFFERENCE
        val bottomsWithinTolerance = abs(pos1YBottom - pos2YBottom) < tolerance

        // Each term below gives the same answer when pos1 and pos2 are swapped, so
        // compare(a, b) and compare(b, a) always take the same branch. The previous
        // form only honoured the tolerance when pos2 was at or above pos1, which
        // let both call orders return -1 for a small subscript sitting just below
        // a taller glyph.
        // NOTE(review): a tolerance-based "same line" relation is still not
        // transitive in general; confirm the sort in use copes with that.
        val sameLine = bottomsWithinTolerance ||
            pos2YBottom in pos1YTop..pos1YBottom ||
            pos1YBottom in pos2YTop..pos2YBottom

        return if (sameLine) {
            pos1.xDirAdj.compareTo(pos2.xDirAdj)
        } else if (pos1YBottom < pos2YBottom) {
            -1
        } else {
            1
        }
    }

    companion object {
        // Height ratio used to derive the tolerance below.
        private const val SIZE_RATIO_DIFFERENCE = 0.85f

        // Fraction of the taller glyph's height by which two bottoms may differ
        // and still count as the same line.
        private const val INV_SIZE_RATIO_DIFFERENCE = 1f - SIZE_RATIO_DIFFERENCE
    }
}
{code}
was:
I process a lot of medical-related PDF files with a lot of superscripts,
subscripts, out-of-order characters, etc.
We tend to have trouble with the sortByPosition flag in PDFTextStripper.
If it's not enabled, we end up with characters which are out of order in some
PDFs.
If we do enable it, it sometimes messes up superscript and subscript positions.
Could you expose a setter for the comparator instance, so that I can try to
correct it? E.g.
{code:java}
// Comparator held by this object; initialised to a TextPositionComparator.
private Comparator<TextPosition> textPositionComparator = new
TextPositionComparator();
/**
* Replaces the comparator stored in {@code textPositionComparator}.
*
* @param newTextPositionComparator the comparator to store; it is assigned
* as-is, without any null check
*/
public void setTextPositionComparator(final Comparator<TextPosition>
newTextPositionComparator) {
this.textPositionComparator = newTextPositionComparator;
}
{code}
Then, in the writePage() method, just use that comparator?
I want to try to implement a comparator that fixes this, e.g. something like
this (in Kotlin):
{code:java}
import org.apache.pdfbox.text.TextPosition
import kotlin.math.abs
/**
 * Comparator for [TextPosition]s that keeps superscripts and subscripts on the
 * line they belong to.
 *
 * Text direction is compared first. For equal directions, two positions are
 * ordered by direction-adjusted X when they count as being on the same line:
 * their bottoms are closer than a fraction of the taller glyph's height, or one
 * bottom lies within the other glyph's vertical extent. In every other case
 * the smaller direction-adjusted Y comes first.
 */
class TextPositionSubscriptComparator : Comparator<TextPosition> {

    /**
     * @param pos1 the first position to compare
     * @param pos2 the second position to compare
     * @return a negative value, zero, or a positive value when [pos1] sorts
     * before, together with, or after [pos2]
     */
    override fun compare(pos1: TextPosition, pos2: TextPosition): Int {
        val textDir = pos1.dir.compareTo(pos2.dir)
        if (textDir != 0) {
            return textDir
        }

        val pos1YBottom = pos1.yDirAdj
        val pos2YBottom = pos2.yDirAdj
        val pos1YTop = pos1YBottom - pos1.heightDir
        val pos2YTop = pos2YBottom - pos2.heightDir

        // Superscript / subscript detection: allow the bottoms to differ by a
        // fraction of the taller of the two character heights.
        val tolerance = maxOf(pos1.heightDir, pos2.heightDir) * INV_SIZE_RATIO_DIFFERENCE
        val bottomsWithinTolerance = abs(pos1YBottom - pos2YBottom) < tolerance

        // All three terms are unchanged when pos1 and pos2 swap roles, so both
        // call orders agree on whether the pair is on one line. Previously the
        // tolerance was only applied when pos2 was at or above pos1, so
        // compare(a, b) and compare(b, a) could both return -1.
        // NOTE(review): a tolerance-based "same line" relation is still not
        // transitive in general; confirm the sort in use copes with that.
        val sameLine = bottomsWithinTolerance ||
            pos2YBottom in pos1YTop..pos1YBottom ||
            pos1YBottom in pos2YTop..pos2YBottom

        return if (sameLine) {
            pos1.xDirAdj.compareTo(pos2.xDirAdj)
        } else if (pos1YBottom < pos2YBottom) {
            -1
        } else {
            1
        }
    }

    companion object {
        // Height ratio used to derive the tolerance below.
        private const val SIZE_RATIO_DIFFERENCE = 0.85f

        // Fraction of the taller glyph's height by which two bottoms may differ
        // and still count as the same line.
        private const val INV_SIZE_RATIO_DIFFERENCE = 1f - SIZE_RATIO_DIFFERENCE
    }
}
{code}
> PDFTextStripper - Expose a setter for the TextPositionComparator
> ----------------------------------------------------------------
>
> Key: PDFBOX-5545
> URL: https://issues.apache.org/jira/browse/PDFBOX-5545
> Project: PDFBox
> Issue Type: Improvement
> Components: Text extraction
> Affects Versions: 3.0.4 JBIG2
> Reporter: Owen McGovern
> Priority: Major
>
> I process a lot of medical-related PDF files with a lot of superscripts,
> subscripts, out-of-order characters, etc.
> We tend to have trouble with the sortByPosition flag in PDFTextStripper.
> If it's not enabled, we end up with characters which are out of order in some
> PDFs.
> If we do enable it, it sometimes messes up superscript and subscript positions.
> Could you expose a setter for the comparator instance, so that I can try to
> correct it? E.g.
>
> {code:java}
> private Comparator<TextPosition> textPositionComparator = new
> TextPositionComparator();
> /**
> *
> * @param newTextPositionComparator
> */
> public void setTextPositionComparator(final Comparator<TextPosition>
> newTextPositionComparator) {
> this.textPositionComparator = newTextPositionComparator;
> }
> {code}
> Then, in the writePage() method, just use that comparator?
> I want to try to implement a comparator that fixes this, e.g. something like
> this (in Kotlin):
>
> {code:java}
> import org.apache.pdfbox.text.TextPosition
> import kotlin.math.abs
> class TextPositionSubscriptComparator : Comparator<TextPosition> {
> override fun compare(pos1: TextPosition, pos2: TextPosition): Int {
> val textDir = pos1.dir.compareTo(pos2.dir)
> return if (textDir != 0) {
> textDir
> } else {
> val x1 = pos1.xDirAdj
> val x2 = pos2.xDirAdj
> val pos1YBottom = pos1.yDirAdj
> val pos2YBottom = pos2.yDirAdj
> val pos1YTop = pos1YBottom - pos1.heightDir
> val pos2YTop = pos2YBottom - pos2.heightDir
> val yDifference = abs(pos1YBottom - pos2YBottom)
> // Superscript / subscript tolerance by ratio of the character
> height
> val overlap = if (pos1.heightDir > pos2.heightDir)
> pos1.heightDir * INV_SIZE_RATIO_DIFFERENCE
> else
> pos2.heightDir * INV_SIZE_RATIO_DIFFERENCE
> if ((yDifference.toDouble() < overlap || pos2YBottom >= pos1YTop)
> && pos2YBottom <= pos1YBottom || pos1YBottom in pos2YTop..pos2YBottom) {
> x1.compareTo(x2)
> } else {
> if (pos1YBottom < pos2YBottom) -1 else 1
> }
> }
> }
> companion object {
> private const val SIZE_RATIO_DIFFERENCE = 0.85f
> private const val INV_SIZE_RATIO_DIFFERENCE = 1f -
> SIZE_RATIO_DIFFERENCE
> }
> }
> {code}
>
>
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]