[ 
https://issues.apache.org/jira/browse/PDFBOX-5545?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Owen McGovern updated PDFBOX-5545:
----------------------------------
    Description: 
I process a lot of medical related PDF files with a lot of superscripts, 
subscripts, out of order characters etc.

We tend to have trouble with the sortByPosition flag in PDFTextStripper.

If it's not enabled, we end up with characters which are out of order in some 
PDFs.

If we do enable it, it sometimes messes up superscript and subscript positions.

Can you expose a setter for the comparator instance, so that I can try to 
correct it? E.g.:

 
{code:java}
private Comparator<TextPosition> textPositionComparator = new 
TextPositionComparator();

    /**
     *
     * @param newTextPositionComparator
     */
    public void setTextPositionComparator(final Comparator<TextPosition> 
newTextPositionComparator) {
        this.textPositionComparator = newTextPositionComparator;
    }
 {code}
Then in the writePage() method, just use that comparator?

 

Users can then potentially inject their own comparator implementation in.

I want to try to implement a comparator that fixes sorting with 
subscript/superscript tolerances, e.g. something like this (in Kotlin):

 
{code:java}
package com.teckro.platform.rest.extractor.pdf.parsing

import mu.KLogging
import org.apache.pdfbox.text.TextPosition
import kotlin.math.abs

/**
 * Comparator for [TextPosition]s that orders two positions by x whenever
 * they are judged to share a line, and by y otherwise.
 *
 * Comparison steps:
 * 1. `dir` values are compared first; a non-zero result is returned as-is.
 * 2. Two positions share a line when their `yDirAdj` values differ by less
 *    than [SAME_LINE_EPSILON], or when their vertical ranges
 *    (`yDirAdj .. yDirAdj + heightDir`, padded on both sides by
 *    [OUT_OF_LINE_TOLERANCE]) overlap. They are then ordered by `xDirAdj`.
 * 3. Otherwise the position with the smaller `yDirAdj` sorts first.
 *
 * NOTE(review): range overlap is not transitive, so three positions can
 * compare cyclically (a before b and b before c by x, yet c before a by y).
 * JVM sort implementations may reject such an ordering with "Comparison
 * method violates its general contract" - confirm on real documents.
 */
class TextPositionSubscriptComparator : Comparator<TextPosition>, KLogging() {

    override fun compare(pos1: TextPosition, pos2: TextPosition): Int {
        val textDir = pos1.dir.compareTo(pos2.dir)
        if (textDir != 0) {
            return textDir
        }

        val x1 = pos1.xDirAdj
        val x2 = pos2.xDirAdj
        val y1 = pos1.yDirAdj
        val y2 = pos2.yDirAdj

        // `overlap` is symmetric, so a single call covers both directions.
        val sameLine = abs(y1 - y2) < SAME_LINE_EPSILON ||
            paddedYRange(pos1).overlap(paddedYRange(pos2))

        val result = when {
            sameLine -> x1.compareTo(x2)
            y1 < y2 -> -1
            else -> 1
        }

        // Debug aid, intentionally disabled. Every line stays commented so
        // that re-wrapping the text cannot turn part of it into live code.
        // logger.info {
        //     "result = $result, [${pos1.unicode}], " +
        //         "x1=${pos1.x}, y1=${pos1.y} ---- " +
        //         "[${pos2.unicode}], x2=${pos2.x}, y2=${pos2.y}"
        // }

        return result
    }

    /** Vertical range of [pos], widened by the tolerance at both ends. */
    private fun paddedYRange(pos: TextPosition): Pair<Float, Float> =
        Pair(
            pos.yDirAdj - OUT_OF_LINE_TOLERANCE,
            pos.yDirAdj + pos.heightDir + OUT_OF_LINE_TOLERANCE
        )

    companion object {
        /** Padding added below and above each vertical range. */
        private const val OUT_OF_LINE_TOLERANCE = 2f

        /** Below this `yDirAdj` difference the y values count as equal. */
        private const val SAME_LINE_EPSILON = 0.1f
    }
}

/**
 * Tests whether the range held in this pair (`first` to `second`) overlaps
 * the range held in [other].
 *
 * The result is `false` only when this range starts above the end of
 * [other] or ends below the start of [other]; ranges that merely touch at
 * an endpoint therefore count as overlapping.
 */
fun Pair<Float, Float>.overlap(other: Pair<Float, Float>): Boolean {
    val startsAfterOther = first > other.second
    val endsBeforeOther = second < other.first
    return !startsAfterOther && !endsBeforeOther
}

{code}
 

It would greatly help if the sorting comparator were configurable.

 

regards,

Owen

 

  was:
I process a lot of medical related PDF files with a lot of superscripts, 
subscripts, out of order characters etc.

We tend to have trouble with the sortByPosition flag in PDFTextStripper.

If it's not enabled, we end up with characters which are out of order in some 
PDFs.

If we do enable it it sometimes messes up superscript and subscript positions.

Can you expose a setter for the comparator instance, so that I can try to 
correct it ? E.g. 

 
{code:java}
private Comparator<TextPosition> textPositionComparator = new 
TextPositionComparator();

    /**
     *
     * @param newTextPositionComparator
     */
    public void setTextPositionComparator(final Comparator<TextPosition> 
newTextPositionComparator) {
        this.textPositionComparator = newTextPositionComparator;
    }
 {code}
Then in the writePage() method, just use that comparator?

 

Users can then potentially inject their own comparator implementation in.

I want to try to implement a comparator that fixes sorting with 
subscript/superscript tolerances, eg. something like this (in Kotlin and 
completely untested so far... )

 
{code:java}
import mu.KLogging
import org.apache.pdfbox.text.TextPosition
import kotlin.math.abs

/**
 * Comparator for [TextPosition]s: compares `dir` first, then orders by
 * `xDirAdj` when the two positions are within 0.1 of each other in
 * `yDirAdj` or their tolerance-padded vertical ranges overlap, and by
 * `yDirAdj` otherwise.
 */
class TextPositionSubscriptComparator : Comparator<TextPosition>, KLogging() {

    override fun compare(pos1: TextPosition, pos2: TextPosition): Int {
        val textDir = pos1.dir.compareTo(pos2.dir)
        return if (textDir != 0) {
            textDir
        } else {
            val x1 = pos1.xDirAdj
            val x2 = pos2.xDirAdj
            val pos1YBottom = pos1.yDirAdj
            val pos2YBottom = pos2.yDirAdj
            val yDifference = abs(pos1YBottom - pos2YBottom)

            val result = if (yDifference < 0.1f) {
                x1.compareTo(x2)
            } else {
                // Vertical range of each position, padded at both ends.
                val range1 = Pair(
                    pos1YBottom - OUT_OF_LINE_TOLERANCE,
                    pos1YBottom + pos1.heightDir + OUT_OF_LINE_TOLERANCE
                )
                val range2 = Pair(
                    pos2YBottom - OUT_OF_LINE_TOLERANCE,
                    pos2YBottom + pos2.heightDir + OUT_OF_LINE_TOLERANCE
                )

                // `overlap` is symmetric, so one call is sufficient.
                if (range1.overlap(range2)) {
                    x1.compareTo(x2)
                } else {
                    if (pos1YBottom < pos2YBottom) -1 else 1
                }
            }

            // Debug aid, intentionally disabled; every line stays commented.
            // logger.info {
            //     "result = $result, [${pos1.unicode}], " +
            //         "x1=${pos1.x}, y1=${pos1.y} ---- " +
            //         "[${pos2.unicode}], x2=${pos2.x}, y2=${pos2.y}"
            // }

            result
        }
    }

    companion object {
        /** Padding added below and above each vertical range. */
        private const val OUT_OF_LINE_TOLERANCE = 2f
    }
}

/**
 * Checks whether a numeric range overlaps with another; ranges that only
 * touch at an endpoint count as overlapping.
 */
fun Pair<Float, Float>.overlap(other: Pair<Float, Float>) =
    !(first > other.second || second < other.first)
{code}
 

It could greatly help if the sorting comparator was configurable.

 

regards,

Owen

 


> PDFTextStripper - Expose a setter for the TextPositionComparator
> ----------------------------------------------------------------
>
>                 Key: PDFBOX-5545
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-5545
>             Project: PDFBox
>          Issue Type: Improvement
>          Components: Text extraction
>    Affects Versions: 3.0.0 PDFBox
>            Reporter: Owen McGovern
>            Priority: Major
>
> I process a lot of medical related PDF files with a lot of superscripts, 
> subscripts, out of order characters etc.
> We tend to have trouble with the sortByPosition flag in PDFTextStripper.
> If it's not enabled, we end up with characters which are out of order in some 
> PDFs.
> If we do enable it it sometimes messes up superscript and subscript positions.
> Can you expose a setter for the comparator instance, so that I can try to 
> correct it ? E.g. 
>  
> {code:java}
> private Comparator<TextPosition> textPositionComparator = new 
> TextPositionComparator();
>     /**
>      *
>      * @param newTextPositionComparator
>      */
>     public void setTextPositionComparator(final Comparator<TextPosition> 
> newTextPositionComparator) {
>         this.textPositionComparator = newTextPositionComparator;
>     }
>  {code}
> Then in the writePage() method, just use that comparator?
>  
> Users can then potentially inject their own comparator implementation in.
> I want to try to implement a comparator that fixes sorting with 
> subscript/superscript tolerances, eg. something like this (in Kotlin)
>  
> {code:java}
> package com.teckro.platform.rest.extractor.pdf.parsing
> import mu.KLogging
> import org.apache.pdfbox.text.TextPosition
> import kotlin.math.abs
> class TextPositionSubscriptComparator : Comparator<TextPosition>, KLogging() {
>     override fun compare(pos1: TextPosition, pos2: TextPosition): Int {
>         val textDir = pos1.dir.compareTo(pos2.dir)
>         return if (textDir != 0) {
>             textDir
>         } else {
>             val x1 = pos1.xDirAdj
>             val x2 = pos2.xDirAdj
>             val pos1YBottom = pos1.yDirAdj
>             val pos2YBottom = pos2.yDirAdj
>             val yDifference = abs(pos1YBottom - pos2YBottom)
>             val result = if (yDifference < 0.1f) {
>                 x1.compareTo(x2)
>             } else {
>                 val range1 = Pair(pos1.yDirAdj - OUT_OF_LINE_TOLERANCE, 
> pos1.yDirAdj + pos1.heightDir + OUT_OF_LINE_TOLERANCE)
>                 val range2 = Pair(pos2.yDirAdj - OUT_OF_LINE_TOLERANCE, 
> pos2.yDirAdj + pos2.heightDir + OUT_OF_LINE_TOLERANCE)
>                 if (range1.overlap(range2) || range2.overlap(range1)) {
>                     x1.compareTo(x2)
>                 } else {
>                     if (pos1YBottom < pos2YBottom) -1 else 1
>                 }
>             }
> //            logger.info { "result = $result, [${pos1.unicode}], 
> x1=${pos1.x}, y1=${pos1.y} ---- [${pos2.unicode}], x2=${pos2.x}, 
> y2=${pos1.y}"  }
>             return result
>         }
>     }
>     companion object {
>         private const val OUT_OF_LINE_TOLERANCE = 2f
>     }
> }
> /**
>  * Checks whether a numeric range overlaps with another
>  */
> fun Pair<Float, Float>.overlap(other: Pair<Float, Float>) =
>         !(first > other.second || second < other.first)
> {code}
>  
> It could greatly help if the sorting comparator was configurable.
>  
> regards,
> Owen
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to