poppler/TextOutputDev.cc | 39 +++++++++++++++++++++++++++++++++------
poppler/TextOutputDev.h  | 17 ++++++++++++-----
utils/pdftotext.1        |  5 +++++
utils/pdftotext.cc       |  7 +++++--
4 files changed, 55 insertions(+), 13 deletions(-)
New commits: commit 54f799e6fda99cf0cc826884247d92c6dc36d8e7 Author: Dan Shea <[email protected]> Date: Thu Aug 1 22:11:44 2019 +0000 Add pdftotext -nodiag flag to remove diagonal text on output diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 645e38fd..f2569fbd 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -177,6 +177,10 @@ #define combMaxMidDelta 0.3 #define combMaxBaseDelta 0.4 +// Text is considered diagonal if abs(tan(angle)) > diagonalThreshold. +// (Or 1/tan(angle) for 90/270 degrees.) +#define diagonalThreshold 0.1 + namespace { inline bool isAscii7 (Unicode uchar) { @@ -2357,11 +2361,12 @@ TextWord *TextWordList::get(int idx) { // TextPage //------------------------------------------------------------------------ -TextPage::TextPage(bool rawOrderA) { +TextPage::TextPage(bool rawOrderA, bool discardDiagA) { int rot; refCnt = 1; rawOrder = rawOrderA; + discardDiag = discardDiagA; curWord = nullptr; charPos = 0; curFont = nullptr; @@ -2384,6 +2389,7 @@ TextPage::TextPage(bool rawOrderA) { underlines = new std::vector<TextUnderline*>(); links = new std::vector<TextLink*>(); mergeCombining = true; + diagonal = false; } TextPage::~TextPage() { @@ -2470,6 +2476,7 @@ void TextPage::clear() { } delete links; + diagonal = false; curWord = nullptr; charPos = 0; curFont = nullptr; @@ -2592,6 +2599,11 @@ void TextPage::beginWord(GfxState *state) { } else { rot = (m[2] > 0) ? 
1 : 3; } + if (fabs(m[0]) >= fabs(m[1])) { + diagonal = fabs(m[1]) > diagonalThreshold * fabs(m[0]); + } else { + diagonal = fabs(m[0]) > diagonalThreshold * fabs(m[1]); + } // for vertical writing mode, the lines are effectively rotated 90 // degrees @@ -2720,6 +2732,12 @@ void TextPage::addChar(GfxState *state, double x, double y, beginWord(state); } + // throw away diagonal chars + if (discardDiag && diagonal) { + charPos += nBytes; + return; + } + // page rotation and/or transform matrices can cause text to be // drawn in reverse order -- in this case, swap the begin/end // coordinates and break text into individual chars @@ -2729,6 +2747,13 @@ void TextPage::addChar(GfxState *state, double x, double y, (curWord->rot == 3 && h1 > 0)) { endWord(); beginWord(state); + + // throw away diagonal chars + if (discardDiag && diagonal) { + charPos += nBytes; + return; + } + x1 += w1; y1 += h1; w1 = -w1; @@ -5648,11 +5673,12 @@ static void TextOutputDev_outputToFile(void *stream, const char *text, int len) TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, - bool append) { + bool append, bool discardDiagA) { text = nullptr; physLayout = physLayoutA; fixedPitch = physLayout ? fixedPitchA : 0; rawOrder = rawOrderA; + discardDiag = discardDiagA; doHTML = false; ok = true; @@ -5679,21 +5705,22 @@ TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA, } // set up text object - text = new TextPage(rawOrderA); + text = new TextPage(rawOrderA, discardDiagA); actualText = new ActualText(text); } TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, - bool rawOrderA) { + bool rawOrderA, bool discardDiagA) { outputFunc = func; outputStream = stream; needClose = false; physLayout = physLayoutA; fixedPitch = physLayout ? 
fixedPitchA : 0; rawOrder = rawOrderA; + discardDiag = discardDiagA; doHTML = false; - text = new TextPage(rawOrderA); + text = new TextPage(rawOrderA, discardDiagA); actualText = new ActualText(text); ok = true; } @@ -5961,7 +5988,7 @@ TextPage *TextOutputDev::takeText() { TextPage *ret; ret = text; - text = new TextPage(rawOrder); + text = new TextPage(rawOrder, discardDiag); return ret; } diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 3ff1754a..7a29c8a0 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -553,7 +553,7 @@ class TextPage { public: // Constructor. - TextPage(bool rawOrderA); + TextPage(bool rawOrderA, bool discardDiagA = false); TextPage(const TextPage &) = delete; TextPage& operator=(const TextPage &) = delete; @@ -685,6 +685,7 @@ private: int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s); bool rawOrder; // keep text in content stream order + bool discardDiag; // discard diagonal text bool mergeCombining; // merge when combining and base characters // are drawn on top of each other @@ -698,6 +699,7 @@ private: int nTinyChars; // number of "tiny" chars seen so far bool lastCharOverlap; // set if the last added char overlapped the // previous char + bool diagonal; // whether the current text is diagonal TextPool *pools[4]; // a "pool" of TextWords for each rotation TextFlow *flows; // linked list of flows @@ -772,18 +774,20 @@ public: // written (this is useful, e.g., for searching text). If // <physLayoutA> is true, the original physical layout of the text // is maintained. If <rawOrder> is true, the text is kept in - // content stream order. + // content stream order. If <discardDiag> is true, diagonal text + // is removed from output. TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, - bool append); + bool append, bool discardDiagA = false); // Create a TextOutputDev which will write to a generic stream. 
If // <physLayoutA> is true, the original physical layout of the text // is maintained. If <rawOrder> is true, the text is kept in - // content stream order. + // content stream order. If <discardDiag> is true, diagonal text + // is removed from output. TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, - bool rawOrderA); + bool rawOrderA, bool discardDiagA = false); // Destructor. ~TextOutputDev(); @@ -920,6 +924,9 @@ private: // assume fixed-pitch characters with this // width bool rawOrder; // keep text in content stream order + bool discardDiag; // Diagonal text, i.e., text that is not close to one of the + //0, 90, 180, or 270 degree axes, is discarded. This is useful + // to skip watermarks drawn on top of body text, etc. bool doHTML; // extra processing for HTML conversion bool ok; // set up ok? diff --git a/utils/pdftotext.1 b/utils/pdftotext.1 index f1a0cb41..dd114e2c 100644 --- a/utils/pdftotext.1 +++ b/utils/pdftotext.1 @@ -62,6 +62,11 @@ Keep the text in content stream order. This is a hack which often "undoes" column formatting, etc. Use of raw mode is no longer recommended. .TP +.B \-nodiag +Discard diagonal text (i.e., text that is not close to one of the +0, 90, 180, or 270 degree axes). This is useful for skipping +watermarks drawn on body text. +.TP .B \-htmlmeta Generate a simple HTML file, including the meta information. 
This simply wraps the text in <pre> and </pre> and prepends the meta diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc index 34b8f87d..88154ac3 100644 --- a/utils/pdftotext.cc +++ b/utils/pdftotext.cc @@ -83,6 +83,7 @@ static bool bboxLayout = false; static bool physLayout = false; static double fixedPitch = 0; static bool rawOrder = false; +static bool discardDiag = false; static bool htmlMeta = false; static char textEncName[128] = ""; static char textEOL[16] = ""; @@ -115,6 +116,8 @@ static const ArgDesc argDesc[] = { "assume fixed-pitch (or tabular) text"}, {"-raw", argFlag, &rawOrder, 0, "keep strings in content stream order"}, + {"-nodiag", argFlag, &discardDiag, 0, + "discard diagonal text"}, {"-htmlmeta", argFlag, &htmlMeta, 0, "generate a simple HTML file, including the meta information"}, {"-enc", argString, textEncName, sizeof(textEncName), @@ -363,7 +366,7 @@ int main(int argc, char *argv[]) { // write text file if (htmlMeta && bbox) { // htmlMeta && is superfluous but makes gcc happier - textOut = new TextOutputDev(nullptr, physLayout, fixedPitch, rawOrder, htmlMeta); + textOut = new TextOutputDev(nullptr, physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag); if (textOut->isOk()) { if (bboxLayout) { @@ -378,7 +381,7 @@ int main(int argc, char *argv[]) { } } else { textOut = new TextOutputDev(textFileName->c_str(), - physLayout, fixedPitch, rawOrder, htmlMeta); + physLayout, fixedPitch, rawOrder, htmlMeta, discardDiag); if (textOut->isOk()) { if ((w==0) && (h==0) && (x==0) && (y==0)) { doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0, _______________________________________________ poppler mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/poppler
