Dear Adam, Thank you for the technical information on the difference of std::shared_ptr and std::unique_ptr. Just I've tried to replace the vector of the pointers to that of the unique_ptrs, and remove the next_xxx methods.
I attached the revised patch (gzipped) to the master, and here I quote the revised part from my last revision, for convenience to take a look on it. Regards, mpsuzuki diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h index 726068a..2cedd1e 100644 --- a/cpp/poppler-private.h +++ b/cpp/poppler-private.h @@ -72,12 +72,11 @@ class text_box_data { public: text_box_data() - : next_text_box(0), has_space_after(false) + : has_space_after(false) { } ustring text; rectf bbox; - text_box *next_text_box; std::vector<rectf> char_bboxes; bool has_space_after; }; diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h index 18af213..3013997 100644 --- a/cpp/poppler-page.h +++ b/cpp/poppler-page.h @@ -22,6 +22,8 @@ #include "poppler-global.h" #include "poppler-rectangle.h" +#include <memory> + namespace poppler { @@ -33,12 +35,10 @@ class POPPLER_CPP_EXPORT text_box { ~text_box(); ustring text() const; rectf bbox() const; - text_box *next_text_box() const; - text_box *next_word() { return this->next_text_box(); }; rectf char_bbox(int i) const; bool has_space_after() const; private: - text_box_data* m_data; + std::unique_ptr<text_box_data> m_data; }; class document; @@ -79,7 +79,7 @@ public: ustring text(const rectf &rect = rectf()) const; ustring text(const rectf &rect, text_layout_enum layout_mode) const; - std::vector<text_box*> text_list(rotation_enum rotation) const; + std::vector<std::unique_ptr<text_box>> text_list(rotation_enum rotation) const; private: page(document_private *doc, int index); diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp index 9461ab9..5da2e1d 100644 --- a/cpp/poppler-page.cpp +++ b/cpp/poppler-page.cpp @@ -291,14 +291,13 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const */ text_box::text_box(const ustring& text, const rectf &bbox) { - m_data = new text_box_data(); + m_data = std::unique_ptr<text_box_data>(new text_box_data()); m_data->text = text; m_data->bbox = bbox; } text_box::~text_box() { - delete m_data; } ustring text_box::text() const @@ -311,11 +310,6 @@ rectf text_box::bbox() const return m_data->bbox; } -text_box* text_box::next_text_box() const -{ - return m_data->next_text_box; -} - rectf text_box::char_bbox(int i) const { return m_data->char_bboxes[i]; @@ -326,10 +320,10 @@ bool text_box::has_space_after() const return m_data->has_space_after; } -std::vector<text_box*> page::text_list(rotation_enum rotate) const +std::vector<std::unique_ptr<text_box>> page::text_list(rotation_enum rotate) const { TextOutputDev *output_dev; - std::vector<text_box*> output_list; + std::vector<std::unique_ptr<text_box>> output_list; const int rotation_value = (int)rotate * 90; /* config values are same with Qt5 Page::TextList() */ @@ -366,7 +360,7 @@ std::vector<text_box*> page::text_list(rotation_enum rotate) const double xMin, yMin, xMax, yMax; word->getBBox(&xMin, &yMin, &xMax, &yMax); - text_box* tb = new text_box(ustr, rectf(xMin, yMin, xMax-xMin, yMax-yMin)); + std::unique_ptr<text_box> tb = std::unique_ptr<text_box>(new text_box(ustr, rectf(xMin, yMin, xMax-xMin, yMax-yMin))); tb->m_data->has_space_after = (word->hasSpaceAfter() == gTrue); tb->m_data->char_bboxes.reserve(word->getLength()); @@ -375,10 +369,7 @@ std::vector<text_box*> page::text_list(rotation_enum rotate) const tb->m_data->char_bboxes.push_back(rectf(xMin, yMin, xMax-xMin, yMax-yMin)); } - if (output_list.size() > 0) - output_list.back()->m_data->next_text_box = tb; - - output_list.push_back(tb); + output_list.push_back(std::move(tb)); } delete word_list; delete output_dev; diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp index 09b180d..d26a0ef 100644 --- a/cpp/tests/poppler-dump.cpp +++ b/cpp/tests/poppler-dump.cpp @@ -333,7 +333,7 @@ static void print_page_text_list(poppler::page *p) std::cout << std::endl; return; } - std::vector<poppler::text_box*> text_list = p->text_list(poppler::rotate_0); + std::vector<std::unique_ptr<poppler::text_box>> text_list = p->text_list(poppler::rotate_0); std::cout << "---" << std::endl; for (size_t i = 0; i < text_list.size(); i ++) { @@ -343,7 +343,6 @@ static void print_page_text_list(poppler::page *p) std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )"; std::cout << std::endl; - delete text_list[i]; } std::cout << "---" << std::endl; } Adam Reichold wrote: > Hello mpsuzuki, > > std::shared_ptr was introduced in C++11 and Poppler does use C++11 only > since a relatively short time, so I guess there just was no time to use > it anywhere. > > In any case it is a pretty heavy-handed solution due to the atomic > memory accesses on the reference counter which will slow down even > single-threaded programs which do not need it. Hence I would suggest > starting with std::unique_ptr to get a move-only type. > > Best regards, Adam. > > Am 02.01.2018 um 08:19 schrieb suzuki toshiya: >> Dear Albert, Adam, >> >> Thank you for the helpful review. I should have more >> training on C++... Please let me spend to improve the >> patch. >> >>> (Another easy option albeit with some >>> overhead to get automatic destruction would be to wrap the actual data >>> inside a std::shared_ptr and hence use reference counting.) >> Yeah, from some posts in stackoverflow, I found about >> shared_ptr... But it seems that current poppler does >> not use shared_ptr at all. Is this a coding policy? >> >> Regards, >> mpsuzuki >> >> Adam Reichold wrote: >>> Hello >>> >>> Am 02.01.2018 um 01:06 schrieb Albert Astals Cid: >>>> El diumenge, 31 de desembre de 2017, a les 0:45:16 CET, suzuki toshiya va >>>> escriure: >>>>> Hi, >>>>> >>>>> I've tried to implement the suggestion, I attached my current patch. >>>>> >>>>> As suggested, the most part is just copied from Qt frontend and renamed, >>>>> except of one point: TextBox.nextWord() looks slightly confusing, >>>>> because the returned object is a pointer to TextBox. I wrote >>>>> text_box.next_text_box() and a macro text_box.next_word() which >>>>> calls next_text_box() internally. >>>>> >>>>> Another point I want to discuss is the design of the list give by >>>>> poppler::page::text_list(). In Qt frontend, Page::textList() returns >>>>> QList<TextBox*>. For similarity, current patch returns >>>>> std::vector<text_box*> for similarity to Qt frontend. >>>>> >>>>> But, if we return the vector of pointers, the client should destruct >>>>> the objects pointed by the vector, before destructing vector itself. >>>>> Using a vector of text_box (not the pointer but the object itself), >>>>> like std::vector<text_box>, could be better, because the destructor >>>>> of the vector would internally call the destructor for text_box object. >>>>> (Qt has qDeleteAll(), but I think std::vector does not have such). >>>>> If I'm misunderstanding about C++, please correct. >>>> First you need to fix the text_box class, you either need to make it >>>> uncopiable (and thus making your suggestion not possible) or you need to >>>> make >>>> the copy and assignment operators not break your class since at the moment >>>> the >>>> default operators will just copy the pointer and everything will break >>>> because >>>> you will end up with a dangling pointer. >>>> >>>> If you go for the second option you'll want the move constructor and move >>>> operator too. >>> Should the first option also be viable if text_box becomes a move-only >>> type, i.e. no copying but move construction and assignment? I think >>> std::vector can handle it. (Another easy option albeit with some >>> overhead to get automatic destruction would be to wrap the actual data >>> inside a std::shared_ptr and hence use reference counting.) >>> >>> Best regards, Adam. >>> >>>> Cheers, >>>> Albert >>>> >>>>> Regards, >>>>> mpsuzuki >>>>> >>>>> Albert Astals Cid wrote: >>>>>> El dimecres, 27 de desembre de 2017, a les 12:26:25 CET, Jeroen Ooms va >>>>>> >>>>>> escriure: >>>>>>> Is there a method in poppler-cpp to extract text from a pdf document, >>>>>>> including the position of each text box? Currently we use page->text() >>>>>>> with page::physical_layout which gives all text per page, but I need >>>>>>> more detailed information about each text box per page. >>>>>> You want to code the variant of qt5 frontend Poppler::Page::textList() >>>>>> for >>>>>> cpp frontend, it shouldn't be that hard getting inspiration (i.e. >>>>>> almost-copying) the code, do you have time for it? >>>>>> >>>>>> Cheers, >>>>>> >>>>>> Albert >>>>>>> _______________________________________________ >>>>>>> poppler mailing list >>>>>>> poppler@lists.freedesktop.org >>>>>>> https://lists.freedesktop.org/mailman/listinfo/poppler >>>>>> _______________________________________________ >>>>>> poppler mailing list >>>>>> poppler@lists.freedesktop.org >>>>>> https://lists.freedesktop.org/mailman/listinfo/poppler >>>> _______________________________________________ >>>> poppler mailing list >>>> poppler@lists.freedesktop.org >>>> https://lists.freedesktop.org/mailman/listinfo/poppler >>>> >> >
add-cpp-text-list_from-master.diff.gz
Description: GNU Zip compressed data
_______________________________________________ poppler mailing list poppler@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/poppler