This is an automated email from the ASF dual-hosted git repository. btellier pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/james-project.git
The following commit(s) were added to refs/heads/master by this push: new a6f9f32d8c JAMES-4100 Improve Search Snippet display (#2583) a6f9f32d8c is described below commit a6f9f32d8c0a63f262b62391d3ec1d15510ad29b Author: hungphan227 <45198168+hungphan...@users.noreply.github.com> AuthorDate: Wed Jan 22 15:18:30 2025 +0700 JAMES-4100 Improve Search Snippet display (#2583) Co-authored-by: hung phan <hp...@linagora.com> --- .../searchhighligt/SearchHighLighterContract.java | 39 ++++++++++++++++++++++ .../lucene/search/LuceneIndexableDocument.java | 2 +- .../lucene/search/LuceneSearchHighlighter.java | 2 ++ .../mailbox/opensearch/json/IndexableMessage.java | 2 +- .../james/mailbox/store/search/SearchUtil.java | 17 ++++++++++ .../contract/SearchSnippetGetMethodContract.scala | 6 ++-- 6 files changed, 63 insertions(+), 5 deletions(-) diff --git a/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java b/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java index 2181140be9..998b8f2ac2 100644 --- a/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java +++ b/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java @@ -532,4 +532,43 @@ public interface SearchHighLighterContract { softly.assertThat(searchSnippets.getFirst().highlightedBody().get()).contains("<mark>barcamp</mark>"); }); } + + @Test + default void highlightSearchShouldShortenGreaterThanCharacters() throws Exception { + MailboxSession session = session(USERNAME1); + + // Given m1,m2 with m1 has body containing the searched word (contentA) + ComposedMessageId m1 = appendMessage(MessageManager.AppendCommand.from( + Message.Builder.of() + .setTo("to@james.local") + .setSubject("Hallo, Thx Matthieu for your help") + .setBody("Start \n>>>>>>>>>> append contentA to > inbox \n>>>>>> End", + StandardCharsets.UTF_8)), + session).getId(); + + 
ComposedMessageId m2 = appendMessage(MessageManager.AppendCommand.from( + Message.Builder.of() + .setTo("to@james.local") + .setSubject("Hallo, Thx Alex for your help") + .setBody("append contentB to inbox", StandardCharsets.UTF_8)), + session).getId(); + + verifyMessageWasIndexed(2); + + // When searching for the word (contentA) in the body + MultimailboxesSearchQuery multiMailboxSearch = MultimailboxesSearchQuery.from(SearchQuery.of( + SearchQuery.bodyContains("contentA"))) + .inMailboxes(List.of(m1.getMailboxId(), m2.getMailboxId())) + .build(); + + // Then highlightSearch should return the SearchSnippet with the highlightedBody containing the word (contentA) + List<SearchSnippet> searchSnippets = Flux.from(testee().highlightSearch(List.of(m1.getMessageId(), m2.getMessageId()), multiMailboxSearch, session)) + .collectList() + .block(); + assertThat(searchSnippets).hasSize(1); + assertSoftly(softly -> { + softly.assertThat(searchSnippets.getFirst().messageId()).isEqualTo(m1.getMessageId()); + softly.assertThat(searchSnippets.getFirst().highlightedBody().get()).isEqualTo("Start \n append <mark>contentA</mark> to > inbox \n End"); + }); + } } diff --git a/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java b/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java index bee20308d2..4f14d14c3f 100644 --- a/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java +++ b/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java @@ -195,7 +195,7 @@ public class LuceneIndexableDocument { doc.add(new TextField(BCC_FIELD, uppercase(EMailers.from(headerCollection.getBccAddressSet()).serialize()), Field.Store.YES)); // index body - Optional<String> bodyText = mimePartExtracted.locateFirstTextBody(); + Optional<String> bodyText = 
mimePartExtracted.locateFirstTextBody().map(SearchUtil::removeGreaterThanCharactersAtBeginningOfLine); Optional<String> bodyHtml = mimePartExtracted.locateFirstHtmlBody(); bodyText.or(() -> bodyHtml) diff --git a/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneSearchHighlighter.java b/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneSearchHighlighter.java index c398865df8..8a25d922c4 100644 --- a/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneSearchHighlighter.java +++ b/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneSearchHighlighter.java @@ -57,6 +57,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.highlight.Formatter; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; +import org.apache.lucene.search.highlight.SimpleHTMLEncoder; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.SimpleSpanFragmenter; @@ -126,6 +127,7 @@ public class LuceneSearchHighlighter implements SearchHighlighter { Query query = buildQueryFromSearchQuery(searchQuery); QueryScorer scorer = new QueryScorer(query); Highlighter highlighter = new Highlighter(formatter, scorer); + highlighter.setEncoder(new SimpleHTMLEncoder()); highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, configuration.fragmentSize())); return highlighter; } diff --git a/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java b/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java index 722d8b8988..8bfaf646f8 100644 --- a/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java +++ b/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java @@ -135,7 +135,7 @@ public class IndexableMessage { 
.asMimePart(textExtractor) .map(parsingResult -> { - Optional<String> bodyText = parsingResult.locateFirstTextBody(); + Optional<String> bodyText = parsingResult.locateFirstTextBody().map(SearchUtil::removeGreaterThanCharactersAtBeginningOfLine); Optional<String> bodyHtml = parsingResult.locateFirstHtmlBody(); boolean hasAttachment = MessageAttachmentMetadata.hasNonInlinedAttachment(message.getAttachments()); diff --git a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java index 8c7686f60c..0a90b132b6 100644 --- a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java +++ b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java @@ -473,5 +473,22 @@ public class SearchUtil { }; } + public static String removeGreaterThanCharactersAtBeginningOfLine(String text) { + StringBuilder result = new StringBuilder(); + boolean isNewLine = false; + for (int i = 0; i < text.length(); i++) { + char current = text.charAt(i); + + if (current == '\n') { + isNewLine = true; + result.append(current); + } else if (!isNewLine || current != '>') { + result.append(current); + isNewLine = false; + } + } + + return result.toString(); + } } diff --git a/server/protocols/jmap-rfc-8621-integration-tests/jmap-rfc-8621-integration-tests-common/src/main/scala/org/apache/james/jmap/rfc8621/contract/SearchSnippetGetMethodContract.scala b/server/protocols/jmap-rfc-8621-integration-tests/jmap-rfc-8621-integration-tests-common/src/main/scala/org/apache/james/jmap/rfc8621/contract/SearchSnippetGetMethodContract.scala index 7246e6a80e..17cd4d3cf9 100644 --- a/server/protocols/jmap-rfc-8621-integration-tests/jmap-rfc-8621-integration-tests-common/src/main/scala/org/apache/james/jmap/rfc8621/contract/SearchSnippetGetMethodContract.scala +++ 
b/server/protocols/jmap-rfc-8621-integration-tests/jmap-rfc-8621-integration-tests-common/src/main/scala/org/apache/james/jmap/rfc8621/contract/SearchSnippetGetMethodContract.scala @@ -529,12 +529,12 @@ trait SearchSnippetGetMethodContract { | "list": [ | { | "emailId": "${messageId1.serialize}", - | "subject": "Weekly report - <mark>vttran</mark> 27/02-03/03/2023", + | "subject": "Weekly report - <mark>vttran</mark> 27&#x2F;02-03&#x2F;03&#x2F;2023", | "preview": null | }, | { | "emailId": "${messageId2.serialize}", - | "subject": "Weekly report - <mark>vttran</mark> 19/08-23/08/2024", + | "subject": "Weekly report - <mark>vttran</mark> 19&#x2F;08-23&#x2F;08&#x2F;2024", | "preview": null | } | ], @@ -610,7 +610,7 @@ trait SearchSnippetGetMethodContract { | "list": [ | { | "emailId": "${messageId1.serialize}", - | "subject": "Weekly report - <mark>vttran</mark> 27/02-03/03/2023", + | "subject": "Weekly report - <mark>vttran</mark> 27&#x2F;02-03&#x2F;03&#x2F;2023", | "preview": null | }, | { --------------------------------------------------------------------- To unsubscribe, e-mail: notifications-unsubscr...@james.apache.org For additional commands, e-mail: notifications-h...@james.apache.org