glib/poppler-page.cc    |    9 ++++++---
 glib/tests/check_text.c |   43 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 48 insertions(+), 4 deletions(-)

New commits:
commit fdb83a88ce196413a874c3e0fb6fbd200b56393c
Author: Nelson Benítez León <nbenit...@gmail.com>
Date:   Mon Jul 5 15:42:44 2021 -0400

    glib: mimic TextSelectionDumper logic change for spaceAfter
    
    Commit d6cccfb8d814d89c51c9e65563be2e475f46212b caused
    issue #1100 because that change in the TextSelectionDumper
    logic *must be mimicked* in poppler_page_get_text_layout_for_area()
    and in poppler_page_get_text_attributes_for_area() because
    all those functions must be consistent with each other in
    the way they traverse and extract the text from the PDF.
    
    Otherwise, wrong results may happen when using them
    to map between graphical coordinates of text glyphs and
    their corresponding positions in the text obtained from
    poppler_page_get_text() (which uses TextSelectionDumper
    to extract the text).
    
    Fixes issue #1100

diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index e81c1e12..684cc07f 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -2332,8 +2332,11 @@ gboolean poppler_page_get_text_layout_for_area(PopplerPage *page, PopplerRectang
     for (i = 0; i < n_lines; i++) {
         std::vector<TextWordSelection *> *line_words = word_list[i];
         n_rects += line_words->size() - 1;
-        for (const TextWordSelection *word_sel : *line_words) {
+        for (std::size_t j = 0; j < line_words->size(); j++) {
+            const TextWordSelection *word_sel = (*line_words)[j];
             n_rects += word_sel->getEnd() - word_sel->getBegin();
+            if (!word_sel->getWord()->hasSpaceAfter() && j < line_words->size() - 1)
+                n_rects--;
         }
     }
 
@@ -2356,7 +2359,7 @@ gboolean poppler_page_get_text_layout_for_area(PopplerPage *page, PopplerRectang
             rect = *rectangles + offset;
             word->getBBox(&x1, &y1, &x2, &y2);
 
-            if (j < line_words->size() - 1) {
+            if (word->hasSpaceAfter() && j < line_words->size() - 1) {
                 TextWordSelection *next_word_sel = (*line_words)[j + 1];
 
                 next_word_sel->getWord()->getBBox(&x3, &y3, &x4, &y4);
@@ -2514,7 +2517,7 @@ GList *poppler_page_get_text_attributes_for_area(PopplerPage *page, PopplerRecta
                 prev_word_i = word_i;
             }
 
-            if (j < line_words->size() - 1) {
+            if (word->hasSpaceAfter() && j < line_words->size() - 1) {
                 attrs->end_index = offset;
                 offset++;
             }
commit e2f7f5e8eae0cb13d88af4400d68697c6e6bf5ed
Author: Nelson Benítez León <nbenit...@gmail.com>
Date:   Sat Jul 10 00:13:46 2021 -0400

    Add glib test for issue #1100

diff --git a/glib/tests/check_text.c b/glib/tests/check_text.c
index 8b10a7a0..9b0c5b61 100644
--- a/glib/tests/check_text.c
+++ b/glib/tests/check_text.c
@@ -16,6 +16,8 @@ int main(int argc, char *argv[])
     GFile *infile;
     PopplerDocument *doc;
     PopplerPage *page;
+    PopplerRectangle *areas = NULL;
+    guint n_glyph_areas, n_utf8_chars;
     int npages, n;
     char *text;
     GError *err = NULL;
@@ -48,7 +50,46 @@ int main(int argc, char *argv[])
     text = poppler_page_get_text(page);
     g_print("%s\n", text);
     g_assert_cmpstr(text, ==, "The slow brown fox jumps over the black dog.");
-    g_object_unref(page);
+
+    /* Cleanup vars for next test */
+    g_clear_object(&page);
+    g_clear_object(&doc);
+    g_clear_object(&infile);
+    g_clear_pointer(&text, g_free);
+
+    /* Test for consistency between utf8 characters returned by poppler_page_get_text()
+     * and glyph layout areas returned by poppler_page_get_text_layout(). Issue #1100 */
+    g_print("Consistency test between poppler_page_get_text() and poppler_page_get_text_layout()\n");
+    g_print("Issue #1100 \n");
+    infile = g_file_new_for_path(TESTDATADIR "/unittestcases/searchAcrossLines.pdf");
+    if (!infile)
+        exit(EXIT_FAILURE);
+
+    doc = poppler_document_new_from_gfile(infile, NULL, NULL, &err);
+    if (doc == NULL) {
+        g_printerr("error opening pdf file: %s\n", err->message);
+        g_error_free(err);
+        exit(EXIT_FAILURE);
+    }
+
+    page = poppler_document_get_page(doc, 0);
+    if (page == NULL || !POPPLER_IS_PAGE(page)) {
+        g_print("error opening pdf page\n");
+        exit(EXIT_FAILURE);
+    }
+
+    text = poppler_page_get_text(page);
+    n_utf8_chars = (guint)g_utf8_strlen(text, -1);
+    poppler_page_get_text_layout(page, &areas, &n_glyph_areas);
+    g_assert_cmpuint(n_glyph_areas, ==, n_utf8_chars);
+    g_print("Test: OK ('layout glyph areas' match amount of 'utf8 characters')\n");
+
+    /* Cleanup vars for next test */
+    g_clear_object(&page);
+    g_clear_object(&doc);
+    g_clear_object(&infile);
+    g_clear_pointer(&areas, g_free);
+    g_clear_pointer(&text, g_free);
 
     return EXIT_SUCCESS;
 }
_______________________________________________
poppler mailing list
poppler@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/poppler

Reply via email to