https://bz.apache.org/bugzilla/show_bug.cgi?id=64418

            Bug ID: 64418
           Summary: Finding text in textfields is very slow
           Product: POI
           Version: unspecified
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P2
         Component: XWPF
          Assignee: [email protected]
          Reporter: [email protected]
  Target Milestone: ---

I am scanning docx documents for occurrences of specific words / search terms.

The code I am using is seen below.

The search terms can literally be anywhere: in header, footer, paragraphs,
tables, text fields, ...

Even with a complex document that uses no or very few text fields, parsing
takes a few seconds. As soon as multiple text fields are involved, parsing
takes a considerable amount of time, e.g. 30 seconds or even more than a minute.


Is there anything I am doing wrong in how I use the API, or is there an issue
with XWPF?

Thanks,
Jens



    /**
     * Walks a list of body elements looking for {@code key} and stops as soon
     * as the key has been recorded in {@code resultList}.
     *
     * <p>Paragraphs are checked via {@code findInParagraph} and then
     * {@code findInTextfield}; tables are handed to {@code findInTable}
     * (defined elsewhere in the file). Other element types are ignored.
     *
     * @param key          the search term
     * @param bodyElements the elements to scan, in document order
     * @param resultList   collects the keys found so far; {@code key} is
     *                     expected to be added by the helpers on a match
     */
    private static void findInBodyElements(String key,
            List<IBodyElement> bodyElements, ArrayList<String> resultList) {
        if (resultList.contains(key)) {
            return;
        }

        for (IBodyElement bodyElement : bodyElements) {
            // Read the type once; enum constants are compared by identity.
            BodyElementType type = bodyElement.getElementType();

            if (type == BodyElementType.PARAGRAPH) {
                XWPFParagraph paragraph = (XWPFParagraph) bodyElement;
                findInParagraph(key, paragraph, resultList);
                // Only pay for the text-field scan when the plain paragraph
                // text did not already contain the key.
                if (!resultList.contains(key)) {
                    findInTextfield(key, paragraph, resultList);
                }
            } else if (type == BodyElementType.TABLE) {
                findInTable(key, (XWPFTable) bodyElement, resultList);
            }

            // Single exit check for every branch, so a hit inside a table
            // also ends the scan instead of visiting the remaining elements.
            if (resultList.contains(key)) {
                return;
            }
        }
    }

    /**
     * Adds {@code key} to {@code resultList} when the paragraph contains it.
     *
     * <p>The lookup is delegated to
     * {@code XWPFParagraph.searchText(String, PositionInParagraph)} with a
     * fresh {@code PositionInParagraph}; a non-null result counts as a match.
     * Nothing happens if {@code key} is already in {@code resultList}.
     *
     * @param key           the search term
     * @param xwpfParagraph the paragraph to search
     * @param resultList    collects the keys found so far
     */
    private static void findInParagraph(String key,
            XWPFParagraph xwpfParagraph, ArrayList<String> resultList) {
        if (resultList.contains(key)) {
            return;
        }

        TextSegment found =
                xwpfParagraph.searchText(key, new PositionInParagraph());
        if (found != null) {
            // The guard above already established that key is not present.
            resultList.add(key);
        }
    }

    private static void findInTextfield(String key, XWPFParagraph
xwpfParagraph, ArrayList<String> resultList) {

        if (resultList.contains(key)) {
            return;
        }

        XmlCursor cursor = xwpfParagraph.getCTP().newCursor();
        cursor.selectPath("declare namespace
w='http://schemas.openxmlformats.org/wordprocessingml/2006/main'
.//*/w:txbxContent/w:p/w:r");

        List<XmlObject> ctrsintxtbx = new ArrayList<XmlObject>();

        while (cursor.hasNextSelection()) {
            cursor.toNextSelection();
            XmlObject obj = cursor.getObject();
            ctrsintxtbx.add(obj);
        }
        for (XmlObject obj : ctrsintxtbx) {
            try {
                CTR ctr = CTR.Factory.parse(obj.xmlText());
                XWPFRun bufferrun = new XWPFRun(ctr, (IRunBody) xwpfParagraph);
                String text = bufferrun.getText(0);
                if (text != null && text.contains(key)) {
                    if (!resultList.contains(key)) {
                        resultList.add(key);
                        return;
                    }
                }
            } catch (Exception ex) {
                log.error("Unable to iterate text fields", ex);
            }
        }

    }

-- 
You are receiving this mail because:
You are the assignee for the bug.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to