poppler/TextOutputDev.cc | 14 +++++++++++--- poppler/TextOutputDev.h | 6 ++++++ utils/pdftotext.1 | 3 +++ utils/pdftotext.cc | 9 +++++++++ 4 files changed, 29 insertions(+), 3 deletions(-)
New commits: commit f20d9e5f739b7c8dce74ebc60a6dd1e06106c12e Author: Nelson Benítez León <[email protected]> Date: Sun Jul 11 14:08:58 2021 -0400 TextOutputDev: require more spacing between columns Require more spacing for adjacent text to be considered a separate column of text. We do that by increasing the 'minColSpacing1' parameter, which sets the maximum distance within which an adjacent word will be pulled into the current block. We provide a way to tweak the default value: double getMinColSpacing1(); void setMinColSpacing1(double val); Fixes issue #1093 diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 5dc37c93..67a6246d 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -122,8 +122,9 @@ #define maxWordSpacing 1.5 // Maximum horizontal spacing which will allow a word to be pulled -// into a block. -#define minColSpacing1 0.3 +// into a block, as a fraction of the font size. +// This default value can be tweaked via API. +double TextOutputDev::minColSpacing1_default = 0.7; // Minimum spacing between columns, as a fraction of the font size. 
#define minColSpacing2 1.0 @@ -2814,6 +2815,11 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link) } void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML) +{ + coalesce(physLayout, fixedPitch, doHTML, TextOutputDev::minColSpacing1_default); +} + +void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1) { TextWord *word0, *word1, *word2; TextLine *line; @@ -5605,6 +5611,7 @@ TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA, double fixe textEOL = defaultEndOfLine(); textPageBreaks = true; ok = true; + minColSpacing1 = minColSpacing1_default; // open file needClose = false; @@ -5648,6 +5655,7 @@ TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA textEOL = defaultEndOfLine(); textPageBreaks = true; ok = true; + minColSpacing1 = minColSpacing1_default; } TextOutputDev::~TextOutputDev() @@ -5669,7 +5677,7 @@ void TextOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) void TextOutputDev::endPage() { text->endPage(); - text->coalesce(physLayout, fixedPitch, doHTML); + text->coalesce(physLayout, fixedPitch, doHTML, minColSpacing1); if (outputStream) { text->dump(outputStream, outputFunc, physLayout, textEOL, textPageBreaks); } diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 984507ba..9df36278 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -596,6 +596,7 @@ public: // Coalesce strings that look like parts of the same line. void coalesce(bool physLayout, double fixedPitch, bool doHTML); + void coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1); // Find a string. If <startAtTop> is true, starts looking at the // top of the page; else if <startAtLast> is true, starts looking @@ -756,6 +757,8 @@ private: class POPPLER_PRIVATE_EXPORT TextOutputDev : public OutputDev { public: + static double minColSpacing1_default; + // Open a text output file. 
If <fileName> is NULL, no file is // written (this is useful, e.g., for searching text). If // <physLayoutA> is true, the original physical layout of the text @@ -885,6 +888,8 @@ public: } void setTextEOL(EndOfLineKind textEOLA) { textEOL = textEOLA; } void setTextPageBreaks(bool textPageBreaksA) { textPageBreaks = textPageBreaksA; } + double getMinColSpacing1() const { return minColSpacing1; } + void setMinColSpacing1(double val) { minColSpacing1 = val; } private: TextOutputFunc outputFunc; // output function @@ -897,6 +902,7 @@ private: double fixedPitch; // if physLayout is true and this is non-zero, // assume fixed-pitch characters with this // width + double minColSpacing1; // see default value defined with same name at TextOutputDev.cc bool rawOrder; // keep text in content stream order bool discardDiag; // Diagonal text, i.e., text that is not close to one of the // 0, 90, 180, or 270 degree axes, is discarded. This is useful diff --git a/utils/pdftotext.1 b/utils/pdftotext.1 index 3ae217b4..39163389 100644 --- a/utils/pdftotext.1 +++ b/utils/pdftotext.1 @@ -85,6 +85,9 @@ block, line, and word in the file. .B \-cropbox Use the crop box rather than the media box with \-bbox and \-bbox-layout. .TP +.BI \-colspacing " number" +Specifies how much spacing we allow after a word before considering adjacent text to be a new column, measured as a fraction of the font size. Current default is 0.7, old releases had a 0.3 default. +.TP .BI \-enc " encoding-name" Sets the encoding to use for text output. This defaults to "UTF-8". 
.TP diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc index 7b45359f..0caca87f 100644 --- a/utils/pdftotext.cc +++ b/utils/pdftotext.cc @@ -84,6 +84,7 @@ static bool bbox = false; static bool bboxLayout = false; static bool physLayout = false; static bool useCropBox = false; +static double colspacing = TextOutputDev::minColSpacing1_default; static double fixedPitch = 0; static bool rawOrder = false; static bool discardDiag = false; @@ -117,6 +118,8 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to { "-bbox", argFlag, &bbox, 0, "output bounding box for each word and page size to html. Sets -htmlmeta" }, { "-bbox-layout", argFlag, &bboxLayout, 0, "like -bbox but with extra layout bounding box data. Sets -htmlmeta" }, { "-cropbox", argFlag, &useCropBox, 0, "use the crop box rather than media box" }, + { "-colspacing", argFP, &colspacing, 0, + "how much spacing we allow after a word before considering adjacent text to be a new column, as a fraction of the font size (default is 0.7, old releases had a 0.3 default)" }, { "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" }, { "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" }, { "-q", argFlag, &quiet, 0, "don't print any messages or errors" }, @@ -178,6 +181,10 @@ int main(int argc, char *argv[]) if (bbox) { htmlMeta = true; } + if (colspacing <= 0 || colspacing > 10) { + error(errCommandLine, -1, "Bogus value provided for -colspacing"); + goto err1; + } if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) { fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION); fprintf(stderr, "%s\n", popplerCopyright); @@ -342,6 +349,7 @@ int main(int argc, char *argv[]) if (textOut->isOk()) { textOut->setTextEOL(textEOL); + textOut->setMinColSpacing1(colspacing); if (noPageBreaks) { textOut->setTextPageBreaks(false); } @@ -358,6 +366,7 @@ int main(int argc, char *argv[]) textOut = 
new TextOutputDev(textFileName->c_str(), physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag); if (textOut->isOk()) { textOut->setTextEOL(textEOL); + textOut->setMinColSpacing1(colspacing); if (noPageBreaks) { textOut->setTextPageBreaks(false); }
