This is an automated email from the ASF dual-hosted git repository.

btellier pushed a commit to branch 3.7.x
in repository https://gitbox.apache.org/repos/asf/james-project.git
commit 372f1f83b6825fb0f92147803a9bf215b8ff690d Author: Benoit TELLIER <btell...@linagora.com> AuthorDate: Tue Feb 7 16:23:47 2023 +0700 [PERF] JsoupHtmlTextExtractor without recursion (#1422) (cherry picked from commit 537ae380f9837f74c075f0ed2b625affa9b20122) --- .../jmap/draft/utils/JsoupHtmlTextExtractor.java | 57 +++++++++++++++------- .../draft/utils/JsoupHtmlTextExtractorTest.java | 10 ++++ 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java b/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java index 4fb4204e7c..41fbe88b65 100644 --- a/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java +++ b/server/protocols/jmap-draft/src/main/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractor.java @@ -19,13 +19,16 @@ package org.apache.james.jmap.draft.utils; +import java.util.Deque; import java.util.Optional; +import java.util.concurrent.ConcurrentLinkedDeque; import java.util.function.Predicate; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; import org.apache.james.util.html.HtmlTextExtractor; +import org.apache.james.util.streams.Iterators; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -55,7 +58,7 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor { Element body = Optional.ofNullable(document.body()).orElse(document); - return flatten(body, INITIAL_LIST_NESTED_LEVEL) + return flatten(body) .map(this::convertNodeToText) .collect(Collectors.joining()); } catch (Exception e) { @@ -110,23 +113,31 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor { } } - Stream<HTMLNode> flatten(Node base, int listNestedLevel) { - Position position = getPosition(base); - int nextElementLevel = 
getNewNestedLevel(listNestedLevel, base); - - Stream<HTMLNode> baseStream = Stream.of(new HTMLNode(base, listNestedLevel)); - Stream<HTMLNode> flatChildren = base.childNodes() - .stream() - .flatMap(node -> flatten(node, nextElementLevel)); - - switch (position) { - case PREFIX: - return Stream.concat(baseStream, flatChildren); - case SUFFIX: - return Stream.concat(flatChildren, baseStream); - default: - throw new RuntimeException("Unexpected POSITION for node element: " + position); + Stream<HTMLNode> flatten(Node base) { + Deque<HTMLNode> in = new ConcurrentLinkedDeque<>(); + in.addFirst(new HTMLNode(base, JsoupHtmlTextExtractor.INITIAL_LIST_NESTED_LEVEL)); + Deque<HTMLNode> out = new ConcurrentLinkedDeque<>(); + + while (!in.isEmpty()) { + HTMLNode node = in.removeFirst(); + if (node.isDone) { + out.addLast(node); + continue; + } + int nextElementLevel = getNewNestedLevel(node.listNestedLevel, node.underlyingNode); + Position position = getPosition(node.underlyingNode); + + if (position == Position.SUFFIX) { + node.underlyingNode.childNodes() + .forEach(child -> in.addFirst(new HTMLNode(child, nextElementLevel))); + out.addLast(node); + } else { + in.addFirst(node.done()); + node.underlyingNode.childNodes() + .forEach(child -> in.addFirst(new HTMLNode(child, nextElementLevel))); + } } + return Iterators.toStream(out.descendingIterator()); } private int getNewNestedLevel(int listNestedLevel, Node node) { @@ -161,10 +172,22 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor { private static class HTMLNode { private final Node underlyingNode; private final int listNestedLevel; + private final boolean isDone; + + public HTMLNode(Node underlyingNode, int listNestedLevel, boolean isDone) { + this.underlyingNode = underlyingNode; + this.listNestedLevel = listNestedLevel; + this.isDone = isDone; + } public HTMLNode(Node underlyingNode, int listNestedLevel) { this.underlyingNode = underlyingNode; this.listNestedLevel = listNestedLevel; + this.isDone = 
false; + } + + public HTMLNode done() { + return new HTMLNode(underlyingNode, listNestedLevel, true); } } diff --git a/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java b/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java index 86197a3b93..4829c00c43 100644 --- a/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java +++ b/server/protocols/jmap-draft/src/test/java/org/apache/james/jmap/draft/utils/JsoupHtmlTextExtractorTest.java @@ -27,6 +27,8 @@ import org.apache.commons.io.IOUtils; import org.junit.Before; import org.junit.Test; +import com.google.common.base.Strings; + public class JsoupHtmlTextExtractorTest { private JsoupHtmlTextExtractor textExtractor; @@ -63,6 +65,14 @@ public class JsoupHtmlTextExtractorTest { assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } + @Test + public void deeplyNestedHtmlShouldNotThrowStackOverflow() { + final int count = 2048; + String html = Strings.repeat("<div>", count) + "<p>para1</p><p>para2</p>" + Strings.repeat("</div>", count); + String expectedPlainText = "para1\n\npara2\n\n"; + assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + @Test public void toPlainTextShouldConciderUpperCaseLabelsAsLowerCase() { String html = "<P>para1</P><p>para2</p>"; --------------------------------------------------------------------- To unsubscribe, e-mail: notifications-unsubscr...@james.apache.org For additional commands, e-mail: notifications-h...@james.apache.org