glib/poppler-page.cc | 9 ++++++--- glib/tests/check_text.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 4 deletions(-)
New commits: commit fdb83a88ce196413a874c3e0fb6fbd200b56393c Author: Nelson Benítez León <nbenit...@gmail.com> Date: Mon Jul 5 15:42:44 2021 -0400 glib: mimic TextSelectionDumper logic change for spaceAfter Commit d6cccfb8d814d89c51c9e65563be2e475f46212b caused issue #1100 because that change in the TextSelectionDumper logic *must be mimicked* in poppler_page_get_text_layout_for_area() and in poppler_page_get_text_attributes_for_area() because all those functions must be consistent with each other in the way they traverse and extract the text from the PDF. Otherwise, wrong results may happen when using them to map between graphical coordinates of text glyphs and their corresponding positions in the text obtained from poppler_page_get_text() (which uses TextSelectionDumper to extract the text). Fixes issue #1100 diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc index e81c1e12..684cc07f 100644 --- a/glib/poppler-page.cc +++ b/glib/poppler-page.cc @@ -2332,8 +2332,11 @@ gboolean poppler_page_get_text_layout_for_area(PopplerPage *page, PopplerRectang for (i = 0; i < n_lines; i++) { std::vector<TextWordSelection *> *line_words = word_list[i]; n_rects += line_words->size() - 1; - for (const TextWordSelection *word_sel : *line_words) { + for (std::size_t j = 0; j < line_words->size(); j++) { + const TextWordSelection *word_sel = (*line_words)[j]; n_rects += word_sel->getEnd() - word_sel->getBegin(); + if (!word_sel->getWord()->hasSpaceAfter() && j < line_words->size() - 1) + n_rects--; } } @@ -2356,7 +2359,7 @@ gboolean poppler_page_get_text_layout_for_area(PopplerPage *page, PopplerRectang rect = *rectangles + offset; word->getBBox(&x1, &y1, &x2, &y2); - if (j < line_words->size() - 1) { + if (word->hasSpaceAfter() && j < line_words->size() - 1) { TextWordSelection *next_word_sel = (*line_words)[j + 1]; next_word_sel->getWord()->getBBox(&x3, &y3, &x4, &y4); @@ -2514,7 +2517,7 @@ GList *poppler_page_get_text_attributes_for_area(PopplerPage *page, PopplerRecta 
prev_word_i = word_i; } - if (j < line_words->size() - 1) { + if (word->hasSpaceAfter() && j < line_words->size() - 1) { attrs->end_index = offset; offset++; } commit e2f7f5e8eae0cb13d88af4400d68697c6e6bf5ed Author: Nelson Benítez León <nbenit...@gmail.com> Date: Sat Jul 10 00:13:46 2021 -0400 Add glib test for issue #1100 diff --git a/glib/tests/check_text.c b/glib/tests/check_text.c index 8b10a7a0..9b0c5b61 100644 --- a/glib/tests/check_text.c +++ b/glib/tests/check_text.c @@ -16,6 +16,8 @@ int main(int argc, char *argv[]) GFile *infile; PopplerDocument *doc; PopplerPage *page; + PopplerRectangle *areas = NULL; + guint n_glyph_areas, n_utf8_chars; int npages, n; char *text; GError *err = NULL; @@ -48,7 +50,46 @@ int main(int argc, char *argv[]) text = poppler_page_get_text(page); g_print("%s\n", text); g_assert_cmpstr(text, ==, "The slow brown fox jumps over the black dog."); - g_object_unref(page); + + /* Cleanup vars for next test */ + g_clear_object(&page); + g_clear_object(&doc); + g_clear_object(&infile); + g_clear_pointer(&text, g_free); + + /* Test for consistency between utf8 characters returned by poppler_page_get_text() + * and glyph layout areas returned by poppler_page_get_text_layout(). 
Issue #1100 */ + g_print("Consistency test between poppler_page_get_text() and poppler_page_get_text_layout()\n"); + g_print("Issue #1100 \n"); + infile = g_file_new_for_path(TESTDATADIR "/unittestcases/searchAcrossLines.pdf"); + if (!infile) + exit(EXIT_FAILURE); + + doc = poppler_document_new_from_gfile(infile, NULL, NULL, &err); + if (doc == NULL) { + g_printerr("error opening pdf file: %s\n", err->message); + g_error_free(err); + exit(EXIT_FAILURE); + } + + page = poppler_document_get_page(doc, 0); + if (page == NULL || !POPPLER_IS_PAGE(page)) { + g_print("error opening pdf page\n"); + exit(EXIT_FAILURE); + } + + text = poppler_page_get_text(page); + n_utf8_chars = (guint)g_utf8_strlen(text, -1); + poppler_page_get_text_layout(page, &areas, &n_glyph_areas); + g_assert_cmpuint(n_glyph_areas, ==, n_utf8_chars); + g_print("Test: OK ('layout glyph areas' match amount of 'utf8 characters')\n"); + + /* Cleanup vars for next test */ + g_clear_object(&page); + g_clear_object(&doc); + g_clear_object(&infile); + g_clear_pointer(&areas, g_free); + g_clear_pointer(&text, g_free); return EXIT_SUCCESS; } _______________________________________________ poppler mailing list poppler@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/poppler