JAMES-2018 Jsoup text extractor should format lists well
Project: http://git-wip-us.apache.org/repos/asf/james-project/repo Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/0172d4b8 Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/0172d4b8 Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/0172d4b8 Branch: refs/heads/master Commit: 0172d4b8634aba7e8e6e322b7aae0cdc2e1fe155 Parents: 84d7a31 Author: benwa <btell...@linagora.com> Authored: Thu May 25 10:00:18 2017 +0700 Committer: benwa <btell...@linagora.com> Committed: Thu Jun 1 16:03:20 2017 +0700 ---------------------------------------------------------------------- .../jmap/utils/JsoupHtmlTextExtractor.java | 46 +++++++++++++--- .../jmap/utils/JsoupHtmlTextExtractorTest.java | 58 ++++++++++++++++++++ 2 files changed, 97 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/james-project/blob/0172d4b8/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java ---------------------------------------------------------------------- diff --git a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java index 912a617..a3ed036 100644 --- a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java +++ b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java @@ -33,6 +33,10 @@ import org.slf4j.LoggerFactory; public class JsoupHtmlTextExtractor implements HtmlTextExtractor { private static final Logger LOGGER = LoggerFactory.getLogger(JsoupHtmlTextExtractor.class); + public static final String BR_TAG = "br"; + public static final String UL_TAG = "ul"; + public static final String LI_TAG = "li"; + public static final String P_TAG = "p"; @Override public String toPlainText(String html) { @@ -57,10 +61,16 @@ public class 
JsoupHtmlTextExtractor implements HtmlTextExtractor { } if (node instanceof Element) { Element element = (Element) node; - if (element.tagName().equals("br")) { + if (element.tagName().equals(BR_TAG)) { return "\n"; } - if (element.tagName().equals("p")) { + if (element.tagName().equals(UL_TAG)) { + return "\n\n"; + } + if (element.tagName().equals(LI_TAG)) { + return "\n - "; + } + if (element.tagName().equals(P_TAG)) { return "\n\n"; } } @@ -68,11 +78,33 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor { } Stream<Node> flatten(Node base) { - return Stream.concat( - base.childNodes() - .stream() - .flatMap(this::flatten), - Stream.of(base)); + Position position = getPosition(base); + Stream<Node> flatChildren = base.childNodes() + .stream() + .flatMap(this::flatten); + switch (position) { + case PREFIX: + return Stream.concat(Stream.of(base), flatChildren); + case SUFFIX: + return Stream.concat(flatChildren, Stream.of(base)); + default: + throw new RuntimeException("Unexpected POSITION for node element: " + position); + } + } + + private enum Position { + PREFIX, + SUFFIX + } + + private Position getPosition(Node node) { + if (node instanceof Element) { + Element element = (Element) node; + if (element.tagName().equals(LI_TAG)) { + return Position.PREFIX; + } + } + return Position.SUFFIX; } } http://git-wip-us.apache.org/repos/asf/james-project/blob/0172d4b8/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java ---------------------------------------------------------------------- diff --git a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java index 28e9d1d..4a413de 100644 --- a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java +++ 
b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java @@ -71,6 +71,64 @@ public class JsoupHtmlTextExtractorTest { } @Test + public void toPlainTextShouldHandleListsWell() { + String html = "<ul>Here is my awesome list:" + + " <li>JMAP</li>" + + " <li>IMAP</li>" + + "</ul>" + + "<p>Followed with some text</p>" + + "<p>And some other text</p>"; + String expectedPlainText = "Here is my awesome list: \n" + + " - JMAP \n" + + " - IMAP\n" + + "\n" + + "Followed with some text\n" + + "\n" + + "And some other text\n" + + "\n"; + assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + + @Test + public void tableShouldBeWellHandled() { + String html = " <table style=\"width:100%\">\n" + + " <tr>\n" + + " <th>Firstname</th>\n" + + " <th>Lastname</th>\n" + + " <th>Age</th>\n" + + " </tr>\n" + + " <tr>\n" + + " <td>Jill</td>\n" + + " <td>Smith</td>\n" + + " <td>50</td>\n" + + " </tr>\n" + + " <tr>\n" + + " <td>Eve</td>\n" + + " <td>Jackson</td>\n" + + " <td>94</td>\n" + + " </tr>\n" + + "</table> "; + String expectedPlainText = "\n" + + " \n" + + " Firstname\n" + + " Lastname\n" + + " Age\n" + + " \n" + + " \n" + + " Jill\n" + + " Smith\n" + + " 50\n" + + " \n" + + " \n" + + " Eve\n" + + " Jackson\n" + + " 94\n" + + " \n" + + " "; + assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + + @Test public void nonClosedHtmlShouldBeTranslated() { String html = "This is an <b>HTML text !"; String expectedPlainText = "This is an HTML text !"; --------------------------------------------------------------------- To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org For additional commands, e-mail: server-dev-h...@james.apache.org