https://bz.apache.org/bugzilla/show_bug.cgi?id=64418
Bug ID: 64418
Summary: Finding text in textfields is very slow
Product: POI
Version: unspecified
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P2
Component: XWPF
Assignee: [email protected]
Reporter: [email protected]
Target Milestone: ---
I am scanning docx documents for occurrences of specific words / search terms.
The code I am using is shown below.
The search terms can literally be anywhere: in header, footer, paragraphs,
tables, text fields, ...
When using even a complex document that contains no or very few text fields,
parsing takes a few seconds. As soon as multiple text fields are involved,
parsing takes a considerable amount of time, e.g. 30 seconds or even more than
a minute.
Is there anything I am doing wrong in how I use the API, or is there an issue
with XWPF?
Thanks,
Jens
/**
 * Walks a list of body elements looking for {@code key}.
 *
 * <p>Returns immediately when {@code resultList} already contains the key.
 * Paragraph elements are handed to {@code findInParagraph} and then to
 * {@code findInTextfield}; the walk stops as soon as either of them has put
 * the key into {@code resultList}. Table elements are handed to
 * {@code findInTable}. Elements of any other type are skipped.
 *
 * @param key          the search term
 * @param bodyElements the elements to inspect, in order
 * @param resultList   collects the keys that were found; mutated in place
 */
private static void findInBodyElements(String key, List<IBodyElement> bodyElements,
        ArrayList<String> resultList) {
    if (resultList.contains(key)) {
        return;
    }
    for (IBodyElement element : bodyElements) {
        switch (element.getElementType()) {
            case PARAGRAPH: {
                XWPFParagraph paragraph = (XWPFParagraph) element;
                // Plain paragraph text first ...
                findInParagraph(key, paragraph, resultList);
                if (resultList.contains(key)) {
                    return;
                }
                // ... then any text boxes anchored in the same paragraph.
                findInTextfield(key, paragraph, resultList);
                if (resultList.contains(key)) {
                    return;
                }
                break;
            }
            case TABLE:
                findInTable(key, (XWPFTable) element, resultList);
                break;
            default:
                // Other element types are not searched.
                break;
        }
    }
}
/**
 * Adds {@code key} to {@code resultList} when the paragraph reports a match
 * for it.
 *
 * <p>Does nothing if {@code resultList} already contains the key. Otherwise
 * the paragraph is queried via {@code searchText}, starting from a fresh
 * {@code PositionInParagraph}; a non-null {@code TextSegment} counts as a hit.
 *
 * @param key           the search term
 * @param xwpfParagraph the paragraph to search
 * @param resultList    collects the keys that were found; mutated in place
 */
private static void findInParagraph(String key, XWPFParagraph xwpfParagraph,
        ArrayList<String> resultList) {
    if (resultList.contains(key)) {
        return;
    }
    TextSegment found = xwpfParagraph.searchText(key, new PositionInParagraph());
    if (found != null) {
        // The guard above already established that the key is not yet recorded.
        resultList.add(key);
    }
}
private static void findInTextfield(String key, XWPFParagraph
xwpfParagraph, ArrayList<String> resultList) {
if (resultList.contains(key)) {
return;
}
XmlCursor cursor = xwpfParagraph.getCTP().newCursor();
cursor.selectPath("declare namespace
w='http://schemas.openxmlformats.org/wordprocessingml/2006/main'
.//*/w:txbxContent/w:p/w:r");
List<XmlObject> ctrsintxtbx = new ArrayList<XmlObject>();
while (cursor.hasNextSelection()) {
cursor.toNextSelection();
XmlObject obj = cursor.getObject();
ctrsintxtbx.add(obj);
}
for (XmlObject obj : ctrsintxtbx) {
try {
CTR ctr = CTR.Factory.parse(obj.xmlText());
XWPFRun bufferrun = new XWPFRun(ctr, (IRunBody) xwpfParagraph);
String text = bufferrun.getText(0);
if (text != null && text.contains(key)) {
if (!resultList.contains(key)) {
resultList.add(key);
return;
}
}
} catch (Exception ex) {
log.error("Unable to iterate text fields", ex);
}
}
}
--
You are receiving this mail because:
You are the assignee for the bug.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]