Repository: james-project Updated Branches: refs/heads/master 1925eebdb -> f19648a51
JAMES-2018 Manage list levels well Project: http://git-wip-us.apache.org/repos/asf/james-project/repo Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/49fbba73 Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/49fbba73 Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/49fbba73 Branch: refs/heads/master Commit: 49fbba73a3f5a599c6a29bf31b9fcd9b627ccf8b Parents: 7203945 Author: benwa <btell...@linagora.com> Authored: Tue May 30 17:23:12 2017 +0700 Committer: benwa <btell...@linagora.com> Committed: Thu Jun 1 16:03:20 2017 +0700 ---------------------------------------------------------------------- .../jmap/utils/JsoupHtmlTextExtractor.java | 59 ++++++++++++++++---- .../jmap/utils/JsoupHtmlTextExtractorTest.java | 22 ++++++++ 2 files changed, 71 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/james-project/blob/49fbba73/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java ---------------------------------------------------------------------- diff --git a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java index c153ae5..d5b359e 100644 --- a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java +++ b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java @@ -22,6 +22,7 @@ package org.apache.james.jmap.utils; import java.util.Optional; import java.util.stream.Stream; +import org.apache.commons.lang.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -40,6 +41,7 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor { public static final String P_TAG = "p"; public static final String IMG_TAG = "img"; public static final String ALT_TAG = "alt"; + public static final int INITIAL_LIST_NESTED_LEVEL = 0; @Override public String toPlainText(String html) { @@ -48,7 +50,7 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor { Element body = Optional.ofNullable(document.body()).orElse(document); - return flatten(body) + return flatten(body, INITIAL_LIST_NESTED_LEVEL) .map(this::convertNodeToText) .reduce("", (s1, s2) -> s1 + s2); } catch (Exception e) { @@ -57,7 +59,8 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor { } } - private String convertNodeToText(Node node) { + private String convertNodeToText(HTMLNode htmlNode) { + Node node = htmlNode.underlyingNode; if (node instanceof TextNode) { TextNode textNode = (TextNode) node; return textNode.getWholeText(); @@ -67,14 +70,14 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor { if (element.tagName().equals(BR_TAG)) { return "\n"; } - if (element.tagName().equals(UL_TAG)) { - return "\n\n"; + if (isList(element)) { + return convertListElement(htmlNode.listNestedLevel); } if (element.tagName().equals(OL_TAG)) { return "\n\n"; } if (element.tagName().equals(LI_TAG)) { - return "\n - "; + return "\n" + StringUtils.repeat(" ", htmlNode.listNestedLevel) + "- "; } if (element.tagName().equals(P_TAG)) { return "\n\n"; @@ -86,21 +89,47 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor { return ""; } - Stream<Node> flatten(Node base) { + private String convertListElement(int nestedLevel) { + if (nestedLevel == 0) { + return "\n\n"; + } else { + return ""; + } + } + + Stream<HTMLNode> flatten(Node base, int listNestedLevel) { Position position = getPosition(base); - Stream<Node> flatChildren = base.childNodes() + int nextElementLevel = getNewNestedLevel(listNestedLevel, base); + + Stream<HTMLNode> baseStream = Stream.of(new HTMLNode(base, listNestedLevel)); + Stream<HTMLNode> flatChildren = base.childNodes() .stream() - .flatMap(this::flatten); + .flatMap(node -> flatten(node, nextElementLevel)); + switch (position) { case PREFIX: - return Stream.concat(Stream.of(base), flatChildren); + return Stream.concat(baseStream, flatChildren); case SUFFIX: - return Stream.concat(flatChildren, Stream.of(base)); + return Stream.concat(flatChildren, baseStream); default: throw new RuntimeException("Unexpected POSITION for node element: " + position); } } + private int getNewNestedLevel(int listNestedLevel, Node node) { + if (node instanceof Element) { + Element element = (Element) node; + if (isList(element)) { + return listNestedLevel + 1; + } + } + return listNestedLevel; + } + + private boolean isList(Element element) { + return element.tagName().equals(UL_TAG) || element.tagName().equals(OL_TAG); + } + private enum Position { PREFIX, SUFFIX @@ -116,4 +145,14 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor { return Position.SUFFIX; } + private static class HTMLNode { + private final Node underlyingNode; + private final int listNestedLevel; + + public HTMLNode(Node underlyingNode, int listNestedLevel) { + this.underlyingNode = underlyingNode; + this.listNestedLevel = listNestedLevel; + } + } + } http://git-wip-us.apache.org/repos/asf/james-project/blob/49fbba73/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java ---------------------------------------------------------------------- diff --git a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java index 75ba62a..30e858a 100644 --- a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java +++ b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java @@ -155,6 +155,28 @@ public class JsoupHtmlTextExtractorTest { } @Test + public void nestedListsShouldBeWellHandled() { + String html = " <ul>" + + " <li>Coffee</li>" + + " <li>Tea" + + " <ul>" + + " <li>Black tea</li>" + + " <li>Green tea</li>" + + " </ul>" + + " </li>" + + " <li>Milk</li>" + + "</ul>"; + String expectedPlainText = " \n" + + " - Coffee \n" + + " - Tea \n" + + " - Black tea \n" + + " - Green tea \n" + + " - Milk\n" + + "\n"; + assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + + @Test public void nonClosedHtmlShouldBeTranslated() { String html = "This is an <b>HTML text !"; String expectedPlainText = "This is an HTML text !"; --------------------------------------------------------------------- To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org For additional commands, e-mail: server-dev-h...@james.apache.org