utils/HtmlOutputDev.cc | 7 +++++-- utils/pdftohtml.1 | 5 +++++ utils/pdftohtml.cc | 7 +++++++ 3 files changed, 17 insertions(+), 2 deletions(-)
New commits: commit e5b914b2bfbb5e95ecde5f1ce148374b1d58dadd Author: Ihar Filipau <[email protected]> Date: Tue Mar 13 23:54:26 2012 +0100 Add possibility of controlling word breaks percentage Bug #47022 diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index 17541a2..19f1c84 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -31,6 +31,7 @@ // Copyright (C) 2011 Joshua Richardson <[email protected]> // Copyright (C) 2011 Stephen Reichling <[email protected]> // Copyright (C) 2011, 2012 Igor Slepchin <[email protected]> +// Copyright (C) 2012 Ihar Filipau <[email protected]> // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -96,6 +97,8 @@ extern GBool xml; extern GBool showHidden; extern GBool noMerge; +extern double wordBreakThreshold; + static GBool debug = gFalse; static GooString *gstr_buff0 = NULL; // a workspace in which I format strings @@ -379,7 +382,7 @@ void HtmlPage::addChar(GfxState *state, double x, double y, // right, which will not necessarily be the case, e.g. if rotated; // It assesses whether or not two characters are close enough to // be part of the same string - fabs(x1 - curStr->xRight[n-1]) > 0.1 * (curStr->yMax - curStr->yMin) && + fabs(x1 - curStr->xRight[n-1]) > wordBreakThreshold * (curStr->yMax - curStr->yMin) && // rotation is (cos q, sin q, -sin q, cos q, 0, 0) // sin q is zero iff there is no rotation, or 180 deg. 
rotation; // for 180 rotation, cos q will be negative @@ -625,7 +628,7 @@ void HtmlPage::coalesce() { { // printf("yes\n"); n = str1->len + str2->len; - if ((addSpace = horSpace > 0.1 * space)) { + if ((addSpace = horSpace > wordBreakThreshold * space)) { ++n; } if (addLineBreak) { diff --git a/utils/pdftohtml.1 b/utils/pdftohtml.1 index 6763bbe..44137e4 100644 --- a/utils/pdftohtml.1 +++ b/utils/pdftohtml.1 @@ -84,6 +84,11 @@ do not merge paragraphs .TP .B \-nodrm override document DRM settings +.TP +.B \-wbt <fp> +adjust the word break threshold percent. Default is 10. +Word break occurs when distance between two adjacent characters is +greater than this percent of character height. .SH AUTHOR diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index 7347161..6735f5d 100644 --- a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -20,6 +20,7 @@ // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey ([email protected]) and Onkar Potdar ([email protected]) // Copyright (C) 2011 Steven Murdoch <[email protected]> // Copyright (C) 2012 Igor Slepchin <[email protected]> +// Copyright (C) 2012 Ihar Filipau <[email protected]> // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -82,6 +83,7 @@ GBool stout=gFalse; GBool xml=gFalse; static GBool errQuiet=gFalse; static GBool noDrm=gFalse; +double wordBreakThreshold=10; // 10%, below converted into a coefficient - 0.1 GBool showHidden = gFalse; GBool noMerge = gFalse; @@ -142,6 +144,8 @@ static const ArgDesc argDesc[] = { "user password (for encrypted files)"}, {"-nodrm", argFlag, &noDrm, 0, "override document DRM settings"}, + {"-wbt", argFP, &wordBreakThreshold, 0, + "word break threshold (default 10 percent)"}, {NULL} }; @@ -221,6 +225,9 @@ int main(int argc, char *argv[]) { } } + // convert from user-friendly percents into a coefficient + wordBreakThreshold /= 100.0; + // open PDF file if (ownerPassword[0]) { ownerPW 
= new GooString(ownerPassword); _______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
