JAMES-2018 Use Jsoup in JMAP project
Project: http://git-wip-us.apache.org/repos/asf/james-project/repo Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/84d7a317 Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/84d7a317 Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/84d7a317 Branch: refs/heads/master Commit: 84d7a3177622119717e46b9c09ea1b545b5f99b4 Parents: 1925eeb Author: benwa <btell...@linagora.com> Authored: Wed May 24 18:17:17 2017 +0700 Committer: benwa <btell...@linagora.com> Committed: Thu Jun 1 16:03:20 2017 +0700 ---------------------------------------------------------------------- .../java/org/apache/james/jmap/JMAPModule.java | 6 +- server/protocols/jmap/pom.xml | 5 + .../jmap/utils/JsoupHtmlTextExtractor.java | 78 ++++++++++++ .../utils/MailboxBasedHtmlTextExtractor.java | 33 ----- .../jmap/methods/GetMessagesMethodTest.java | 5 +- .../james/jmap/model/MessageFactoryTest.java | 5 +- .../jmap/utils/JsoupHtmlTextExtractorTest.java | 119 +++++++++++++++++++ .../MailboxBasedHtmlTextExtractorTest.java | 112 ----------------- 8 files changed, 209 insertions(+), 154 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/james-project/blob/84d7a317/server/container/guice/protocols/jmap/src/main/java/org/apache/james/jmap/JMAPModule.java ---------------------------------------------------------------------- diff --git a/server/container/guice/protocols/jmap/src/main/java/org/apache/james/jmap/JMAPModule.java b/server/container/guice/protocols/jmap/src/main/java/org/apache/james/jmap/JMAPModule.java index a43b0d6..a04a866 100644 --- a/server/container/guice/protocols/jmap/src/main/java/org/apache/james/jmap/JMAPModule.java +++ b/server/container/guice/protocols/jmap/src/main/java/org/apache/james/jmap/JMAPModule.java @@ -32,7 +32,7 @@ import org.apache.james.jmap.mailet.VacationMailet; import org.apache.james.jmap.methods.RequestHandler; import 
org.apache.james.jmap.send.PostDequeueDecoratorFactory; import org.apache.james.jmap.utils.HtmlTextExtractor; -import org.apache.james.jmap.utils.MailboxBasedHtmlTextExtractor; +import org.apache.james.jmap.utils.JsoupHtmlTextExtractor; import org.apache.james.jmap.utils.SystemMailboxesProvider; import org.apache.james.jmap.utils.SystemMailboxesProviderImpl; import org.apache.james.jwt.JwtConfiguration; @@ -84,10 +84,10 @@ public class JMAPModule extends AbstractModule { bind(JMAPServer.class).in(Scopes.SINGLETON); bind(RequestHandler.class).in(Scopes.SINGLETON); bind(UploadHandler.class).in(Scopes.SINGLETON); - bind(MailboxBasedHtmlTextExtractor.class).in(Scopes.SINGLETON); + bind(JsoupHtmlTextExtractor.class).in(Scopes.SINGLETON); bind(SystemMailboxesProviderImpl.class).in(Scopes.SINGLETON); - bind(HtmlTextExtractor.class).to(MailboxBasedHtmlTextExtractor.class); + bind(HtmlTextExtractor.class).to(JsoupHtmlTextExtractor.class); Multibinder.newSetBinder(binder(), ConfigurationPerformer.class).addBinding().to(RequiredCapabilitiesPrecondition.class); Multibinder<CamelMailetContainerModule.TransportProcessorCheck> transportProcessorChecks = Multibinder.newSetBinder(binder(), CamelMailetContainerModule.TransportProcessorCheck.class); http://git-wip-us.apache.org/repos/asf/james-project/blob/84d7a317/server/protocols/jmap/pom.xml ---------------------------------------------------------------------- diff --git a/server/protocols/jmap/pom.xml b/server/protocols/jmap/pom.xml index cf74ca9..4546b50 100644 --- a/server/protocols/jmap/pom.xml +++ b/server/protocols/jmap/pom.xml @@ -359,6 +359,11 @@ <version>0.9.1</version> </dependency> <dependency> + <groupId>org.jsoup</groupId> + <artifactId>jsoup</artifactId> + <version>1.9.2</version> + </dependency> + <dependency> <groupId>org.mockito</groupId> <artifactId>mockito-core</artifactId> <scope>test</scope> 
http://git-wip-us.apache.org/repos/asf/james-project/blob/84d7a317/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java ---------------------------------------------------------------------- diff --git a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java new file mode 100644 index 0000000..912a617 --- /dev/null +++ b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java @@ -0,0 +1,78 @@ +/**************************************************************** + * Licensed to the Apache Software Foundation (ASF) under one * + * or more contributor license agreements. See the NOTICE file * + * distributed with this work for additional information * + * regarding copyright ownership. The ASF licenses this file * + * to you under the Apache License, Version 2.0 (the * + * "License"); you may not use this file except in compliance * + * with the License. You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, * + * software distributed under the License is distributed on an * + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * + * KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations * + * under the License. 
* + ****************************************************************/ + +package org.apache.james.jmap.utils; + +import java.util.Optional; +import java.util.stream.Stream; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class JsoupHtmlTextExtractor implements HtmlTextExtractor { + + private static final Logger LOGGER = LoggerFactory.getLogger(JsoupHtmlTextExtractor.class); + + @Override + public String toPlainText(String html) { + try { + Document document = Jsoup.parse(html); + + Element body = Optional.ofNullable(document.body()).orElse(document); + + return flatten(body) + .map(this::convertNodeToText) + .reduce("", (s1, s2) -> s1 + s2); + } catch (Exception e) { + LOGGER.warn("Failed extracting text from html", e); + return html; + } + } + + private String convertNodeToText(Node node) { + if (node instanceof TextNode) { + TextNode textNode = (TextNode) node; + return textNode.getWholeText(); + } + if (node instanceof Element) { + Element element = (Element) node; + if (element.tagName().equals("br")) { + return "\n"; + } + if (element.tagName().equals("p")) { + return "\n\n"; + } + } + return ""; + } + + Stream<Node> flatten(Node base) { + return Stream.concat( + base.childNodes() + .stream() + .flatMap(this::flatten), + Stream.of(base)); + } + +} http://git-wip-us.apache.org/repos/asf/james-project/blob/84d7a317/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractor.java ---------------------------------------------------------------------- diff --git a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractor.java b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractor.java deleted file mode 100644 index 99cb01e..0000000 --- 
a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractor.java +++ /dev/null @@ -1,33 +0,0 @@ - - -package org.apache.james.jmap.utils; - -import java.io.ByteArrayInputStream; - -import javax.inject.Inject; - -import org.apache.james.mailbox.extractor.TextExtractor; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class MailboxBasedHtmlTextExtractor implements HtmlTextExtractor { - - private static final Logger LOGGER = LoggerFactory.getLogger(MailboxBasedHtmlTextExtractor.class); - - private final TextExtractor textExtractor; - - @Inject - public MailboxBasedHtmlTextExtractor(TextExtractor textExtractor) { - this.textExtractor = textExtractor; - } - - @Override - public String toPlainText(String html) { - try { - return textExtractor.extractContent(new ByteArrayInputStream(html.getBytes()), "text/html", "").getTextualContent(); - } catch (Exception e) { - LOGGER.warn("Error extracting text from HTML", e); - return html; - } - } -} http://git-wip-us.apache.org/repos/asf/james-project/blob/84d7a317/server/protocols/jmap/src/test/java/org/apache/james/jmap/methods/GetMessagesMethodTest.java ---------------------------------------------------------------------- diff --git a/server/protocols/jmap/src/test/java/org/apache/james/jmap/methods/GetMessagesMethodTest.java b/server/protocols/jmap/src/test/java/org/apache/james/jmap/methods/GetMessagesMethodTest.java index db400db..7a0ab78 100644 --- a/server/protocols/jmap/src/test/java/org/apache/james/jmap/methods/GetMessagesMethodTest.java +++ b/server/protocols/jmap/src/test/java/org/apache/james/jmap/methods/GetMessagesMethodTest.java @@ -41,7 +41,7 @@ import org.apache.james.jmap.model.MessageFactory; import org.apache.james.jmap.model.MessagePreviewGenerator; import org.apache.james.jmap.model.MessageProperties.MessageProperty; import org.apache.james.jmap.utils.HtmlTextExtractor; -import org.apache.james.jmap.utils.MailboxBasedHtmlTextExtractor; +import 
org.apache.james.jmap.utils.JsoupHtmlTextExtractor; import org.apache.james.mailbox.MailboxManager; import org.apache.james.mailbox.MailboxSession; import org.apache.james.mailbox.MessageIdManager; @@ -54,7 +54,6 @@ import org.apache.james.mailbox.mock.MockMailboxSession; import org.apache.james.mailbox.model.ComposedMessageId; import org.apache.james.mailbox.model.MailboxId; import org.apache.james.mailbox.model.MailboxPath; -import org.apache.james.mailbox.tika.extractor.TikaTextExtractor; import org.apache.james.metrics.logger.DefaultMetricFactory; import org.apache.james.util.mime.MessageContentExtractor; import org.assertj.core.api.Condition; @@ -116,7 +115,7 @@ public class GetMessagesMethodTest { @Before public void setup() throws Exception { clientId = ClientId.of("#0"); - HtmlTextExtractor htmlTextExtractor = new MailboxBasedHtmlTextExtractor(new TikaTextExtractor()); + HtmlTextExtractor htmlTextExtractor = new JsoupHtmlTextExtractor(); MessagePreviewGenerator messagePreview = new MessagePreviewGenerator(); MessageContentExtractor messageContentExtractor = new MessageContentExtractor(); MessageFactory messageFactory = new MessageFactory(messagePreview, messageContentExtractor, htmlTextExtractor); http://git-wip-us.apache.org/repos/asf/james-project/blob/84d7a317/server/protocols/jmap/src/test/java/org/apache/james/jmap/model/MessageFactoryTest.java ---------------------------------------------------------------------- diff --git a/server/protocols/jmap/src/test/java/org/apache/james/jmap/model/MessageFactoryTest.java b/server/protocols/jmap/src/test/java/org/apache/james/jmap/model/MessageFactoryTest.java index f95998f..cc65cc7 100644 --- a/server/protocols/jmap/src/test/java/org/apache/james/jmap/model/MessageFactoryTest.java +++ b/server/protocols/jmap/src/test/java/org/apache/james/jmap/model/MessageFactoryTest.java @@ -31,14 +31,13 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import 
org.apache.james.jmap.model.MessageFactory.MetaDataWithContent; import org.apache.james.jmap.utils.HtmlTextExtractor; -import org.apache.james.jmap.utils.MailboxBasedHtmlTextExtractor; +import org.apache.james.jmap.utils.JsoupHtmlTextExtractor; import org.apache.james.mailbox.MessageUid; import org.apache.james.mailbox.inmemory.InMemoryId; import org.apache.james.mailbox.model.AttachmentId; import org.apache.james.mailbox.model.Cid; import org.apache.james.mailbox.model.MessageAttachment; import org.apache.james.mailbox.model.TestMessageId; -import org.apache.james.mailbox.tika.extractor.TikaTextExtractor; import org.apache.james.util.mime.MessageContentExtractor; import org.junit.Before; import org.junit.Test; @@ -57,7 +56,7 @@ public class MessageFactoryTest { @Before public void setUp() { - htmlTextExtractor = new MailboxBasedHtmlTextExtractor(new TikaTextExtractor()); + htmlTextExtractor = new JsoupHtmlTextExtractor(); messagePreview = new MessagePreviewGenerator(); MessageContentExtractor messageContentExtractor = new MessageContentExtractor(); http://git-wip-us.apache.org/repos/asf/james-project/blob/84d7a317/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java ---------------------------------------------------------------------- diff --git a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java new file mode 100644 index 0000000..28e9d1d --- /dev/null +++ b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java @@ -0,0 +1,119 @@ +/**************************************************************** + * Licensed to the Apache Software Foundation (ASF) under one * + * or more contributor license agreements. See the NOTICE file * + * distributed with this work for additional information * + * regarding copyright ownership. 
The ASF licenses this file * + * to you under the Apache License, Version 2.0 (the * + * "License"); you may not use this file except in compliance * + * with the License. You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, * + * software distributed under the License is distributed on an * + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * + * KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations * + * under the License. * + ****************************************************************/ + +package org.apache.james.jmap.utils; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.charset.StandardCharsets; + +import org.apache.commons.io.IOUtils; +import org.junit.Before; +import org.junit.Test; + +public class JsoupHtmlTextExtractorTest { + + private JsoupHtmlTextExtractor textExtractor; + + @Before + public void setUp() { + textExtractor = new JsoupHtmlTextExtractor(); + } + + @Test + public void toPlainTextShouldNotModifyPlainText() { + String textWithoutHtml = "text without html"; + assertThat(textExtractor.toPlainText(textWithoutHtml)).isEqualTo(textWithoutHtml); + } + + @Test + public void toPlainTextShouldRemoveSimpleHtmlTag() { + String html = "This is an <b>HTML</b> text !"; + String expectedPlainText = "This is an HTML text !"; + assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + + @Test + public void toPlainTextShouldReplaceSkipLine() { + String html = "<p>This is an<br/>HTML text !</p>"; + String expectedPlainText = "This is an\nHTML text !\n\n"; + assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + + @Test + public void toPlainTextShouldSkipLinesBetweenParagraph() { + String html = "<p>para1</p><p>para2</p>"; + String expectedPlainText = "para1\n\npara2\n\n"; + 
assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + + @Test + public void toPlainTextShouldConciderUpperCaseLabelsAsLowerCase() { + String html = "<P>para1</P><p>para2</p>"; + String expectedPlainText = "para1\n\npara2\n\n"; + assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + + @Test + public void nonClosedHtmlShouldBeTranslated() { + String html = "This is an <b>HTML text !"; + String expectedPlainText = "This is an HTML text !"; + assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + + @Test + public void brokenHtmlShouldBeTranslatedUntilTheBrokenBalise() { + String html = "This is an <b>HTML</b missing missing missing !"; + String expectedPlainText = "This is an HTML"; + assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + + @Test + public void toPlainTextShouldWorkWithMoreComplexHTML() throws Exception { + String html = IOUtils.toString(ClassLoader.getSystemResource("example.html"), StandardCharsets.UTF_8); + String expectedPlainText = "\n" + + " Why a new Logo?\n" + + "\n" + + "\n" + + " We are happy with our current logo, but for the\n" + + " upcoming James Server 3.0 release, we would like to\n" + + " give our community the opportunity to create a new image for James.\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " Don't be shy, take your inkscape and gimp, and send us on\n" + + " the James Server User mailing list\n" + + " your creations. 
We will publish them on this page.\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " We need an horizontal logo (100p height) to be show displayed on the upper\n" + + " left corner of this page, an avatar (48x48p) to be used on a Twitter stream for example.\n" + + " The used fonts should be redistributable (or commonly available on Windows and Linux).\n" + + " The chosen logo should be delivered in SVG format.\n" + + " We also like the Apache feather.\n" + + "\n" + + "\n" + + "\n"; + assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + +} http://git-wip-us.apache.org/repos/asf/james-project/blob/84d7a317/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractorTest.java ---------------------------------------------------------------------- diff --git a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractorTest.java b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractorTest.java deleted file mode 100644 index 9a44dde..0000000 --- a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractorTest.java +++ /dev/null @@ -1,112 +0,0 @@ -/**************************************************************** - * Licensed to the Apache Software Foundation (ASF) under one * - * or more contributor license agreements. See the NOTICE file * - * distributed with this work for additional information * - * regarding copyright ownership. The ASF licenses this file * - * to you under the Apache License, Version 2.0 (the * - * "License"); you may not use this file except in compliance * - * with the License. 
You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, * - * software distributed under the License is distributed on an * - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * - * KIND, either express or implied. See the License for the * - * specific language governing permissions and limitations * - * under the License. * - ****************************************************************/ - -package org.apache.james.jmap.utils; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.charset.StandardCharsets; - -import org.apache.commons.io.IOUtils; -import org.apache.james.mailbox.tika.extractor.TikaTextExtractor; -import org.junit.Before; -import org.junit.Test; - -public class MailboxBasedHtmlTextExtractorTest { - - private MailboxBasedHtmlTextExtractor textExtractor; - - @Before - public void setUp() { - textExtractor = new MailboxBasedHtmlTextExtractor(new TikaTextExtractor()); - } - - @Test - public void toPlainTextShouldNotModifyPlainText() { - String textWithoutHtml = "text without html"; - assertThat(textExtractor.toPlainText(textWithoutHtml)).isEqualTo(textWithoutHtml); - } - - @Test - public void toPlainTextShouldRemoveSimpleHtmlTag() { - String html = "This is an <b>HTML</b> text !"; - String expectedPlainText = "This is an HTML text !"; - assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); - } - - @Test - public void toPlainTextShouldReplaceSkipLine() { - String html = "<p>This is an<br/>HTML text !</p>"; - String expectedPlainText = "This is an\nHTML text !\n"; - assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); - } - - @Test - public void toPlainTextShouldSkipLinesBetweenParagraph() { - String html = "<p>para1</p><p>para2</p>"; - String expectedPlainText = "para1\npara2\n"; - assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); - } - - @Test - 
public void nonClosedHtmlShouldBeTranslated() { - String html = "This is an <b>HTML text !"; - String expectedPlainText = "This is an HTML text !"; - assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); - } - - @Test - public void brokenHtmlShouldBeTranslatedUntilTheBrokenBalise() { - String html = "This is an <b>HTML</b missing missing missing !"; - String expectedPlainText = "This is an HTML"; - assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); - } - - @Test - public void toPlainTextShouldWorkWithMoreComplexHTML() throws Exception { - String html = IOUtils.toString(ClassLoader.getSystemResource("example.html"), StandardCharsets.UTF_8); - String expectedPlainText = "\n" + - " Why a new Logo?\n" + - "\n" + - "\n" + - "\n" + - " We are happy with our current logo, but for the\n" + - " upcoming James Server 3.0 release, we would like to\n" + - " give our community the opportunity to create a new image for James.\n" + - "\n" + - "\n" + - "\n" + - " Don't be shy, take your inkscape and gimp, and send us on\n" + - " the James Server User mailing list\n" + - " your creations. We will publish them on this page.\n" + - "\n" + - "\n" + - "\n" + - " We need an horizontal logo (100p height) to be show displayed on the upper\n" + - " left corner of this page, an avatar (48x48p) to be used on a Twitter stream for example.\n" + - " The used fonts should be redistributable (or commonly available on Windows and Linux).\n" + - " The chosen logo should be delivered in SVG format.\n" + - " We also like the Apache feather.\n" + - "\n" + - "\n" + - "\n"; - assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); - } - -} --------------------------------------------------------------------- To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org For additional commands, e-mail: server-dev-h...@james.apache.org