utils/HtmlOutputDev.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
New commits: commit 28a523d6485d3be3c2a606cc942c34536cd26b50 Author: Christopher Hasse <[email protected]> Date: Mon Sep 13 01:21:20 2021 -0500 Update pdftohtml duplicate detection The delta values used now are the same as the ones used in pdftotext, which have proven to be much more reliable. Additionally, the search range on the x-axis for duplicate strings has been increased, which seems to vastly improve the ability to find duplicates. This algorithm can now properly detect duplicates as shown in #321. diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index 1c2f26c3..9e832bc8 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -516,11 +516,11 @@ void HtmlPage::coalesce() bool found; while (str1) { double size = str1->yMax - str1->yMin; - double xLimit = str1->xMin + size * 0.175; + double xLimit = str1->xMin + size; found = false; for (str2 = str1, str3 = str1->yxNext; str3 && str3->xMin < xLimit; str2 = str3, str3 = str2->yxNext) { if (str3->len == str1->len && !memcmp(str3->text, str1->text, str1->len * sizeof(Unicode)) && fabs(str3->yMin - str1->yMin) < size * 0.2 && fabs(str3->yMax - str1->yMax) < size * 0.2 - && fabs(str3->xMax - str1->xMax) < size * 0.175) { + && fabs(str3->xMax - str1->xMax) < size * 0.1) { found = true; // printf("found duplicate!\n"); break; commit 94448a433c8690cb782ca9783d22e411e8d80e8d Author: Christopher Hasse <[email protected]> Date: Sun Sep 12 03:53:08 2021 -0500 pdftohtml: Reduce sensitivity of duplicate detection fixes #1117 In some fonts, strings such as "ll" or "ff" are placed close enough together to trigger duplicate detection in pdftohtml. This commit makes the detection algorithm less sensitive to reduce the false positives while still maintaining the original function of the code. Prior to this commit, if a character's `xMax` is less than 20% of its height away from the following character's `xMax`, it is treated as a duplicate and removed. 
This commit changes that value to 17.5%, which will reduce the number of false positives without introducing too many false negatives. diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index d49ccf9e..1c2f26c3 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -516,11 +516,11 @@ void HtmlPage::coalesce() bool found; while (str1) { double size = str1->yMax - str1->yMin; - double xLimit = str1->xMin + size * 0.2; + double xLimit = str1->xMin + size * 0.175; found = false; for (str2 = str1, str3 = str1->yxNext; str3 && str3->xMin < xLimit; str2 = str3, str3 = str2->yxNext) { if (str3->len == str1->len && !memcmp(str3->text, str1->text, str1->len * sizeof(Unicode)) && fabs(str3->yMin - str1->yMin) < size * 0.2 && fabs(str3->yMax - str1->yMax) < size * 0.2 - && fabs(str3->xMax - str1->xMax) < size * 0.2) { + && fabs(str3->xMax - str1->xMax) < size * 0.175) { found = true; // printf("found duplicate!\n"); break;
