Hi, I'm parsing some PDF files with the following code:
Use this PDF, for example:
https://www.sedl.org/afterschool/toolkits/science/pdf/ast_sci_data_tables_sample.pdf

#include <stdio.h>
#include <stdlib.h>
#include <glib/poppler.h>

int main(void)
{
  GError *error = NULL;
  PopplerDocument *doc = poppler_document_new_from_file("file:///path/to/ast_sci_data_tables_sample.pdf", NULL, &error);
  if (!doc) {
    printf("%s\n", error->message);
    return 0;
  }
  PopplerPage *page = poppler_document_get_page(doc, 1);
  // char *content = poppler_page_get_text(page);
  char *content;
  double height;
  poppler_page_get_size(page, NULL, &height);
  GList *list = poppler_page_find_text(page, "Wingfoot Express");
  for (GList *node = list; node != NULL; node = node->next) {
    PopplerRectangle *rec = (PopplerRectangle *)node->data;
    content = poppler_page_get_text_for_area(page, rec);
    printf("%s\n", content); // Displays unexpected wrong text
    printf("x1=%f, y1=%f, x2=%f, y2=%f\n", rec->x1, rec->y1, rec->x2, rec->y2);
    rec->y1 = height - rec->y1;
    rec->y2 = height - rec->y2;
    content = poppler_page_get_text_for_area(page, rec);
    printf("%s\n", content); // Displays expected text
    printf("x1=%f, y1=%f, x2=%f, y2=%f\n", rec->x1, rec->y1, rec->x2, rec->y2);
  }
  return 0;
}

I'm confused that I cannot use the PopplerRectangle returned by poppler_page_find_text() with poppler_page_get_text_for_area(). I think that poppler_page_get_text_for_area() should return the proper text without my having to edit the result of poppler_page_find_text(). In the C++ version, I can use the poppler::rectangle returned by page->search() with page->text().

#include <iostream>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <iostream>
#include <sstream>
#include <poppler-document.h>
#include <poppler-page.h>
#include <poppler-rectangle.h>

int main(int argx, char **argv)
{
  const std::string path("/path/to/ast_sci_data_tables_sample.pdf");
  poppler::document *doc = poppler::document::load_from_file(path);
  poppler::page *page = doc->create_page(1);
  poppler::rectangle <double>r;
  poppler::ustring text = poppler::ustring::from_latin1("Wingfoot Express");
  bool matched = page->search(text, r, poppler::page::search_direction_enum::search_from_top, poppler::case_sensitivity_enum::case_sensitive);
  std::cout << text.to_latin1() << std::endl
            << matched << std::endl
            << r.x() << "," << r.y() << std::endl
            << r.left() << "," << r.right() << std::endl
            << r.top() << "," << r.bottom() << std::endl
            << r.width() << "," << r.height() << std::endl
            << r << std::endl;
  poppler::ustring t = page->text(r);
  std::cout << t.to_latin1() << std::endl;
  return 0;
}

Thanks

--
Kenji Okimoto <[email protected]>
_______________________________________________
poppler mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/poppler
