This is an automated email from the ASF dual-hosted git repository. kwin pushed a commit to branch feature/markup-linebreaks in repository https://gitbox.apache.org/repos/asf/maven-doxia.git
commit e8a70ba1f2464ba44723e5679d90ef4c7e3c296c Author: Konrad Windszus <[email protected]> AuthorDate: Thu Feb 26 09:14:21 2026 +0100 Distinguish between linebreaks for formatting markup and linebreaks in output Add new Sink method "markupLineBreak" for insignificant linebreaks. XhtmlParser detects insignificant linebreaks according to https://developer.mozilla.org/en-US/docs/Web/CSS/Guides/Text/Whitespace and emits them accordingly. It also collapses whitespace. It assumes no CSS overrides for boxed/inline elements and for "white-space-collapse". This closes #882 --- .../maven/doxia/parser/Xhtml5BaseParser.java | 812 +++++++++++++-------- .../apache/maven/doxia/sink/impl/SinkWrapper.java | 4 +- .../maven/doxia/sink/impl/Xhtml5BaseSink.java | 21 +- .../maven/doxia/parser/Xhtml5BaseParserTest.java | 29 +- .../maven/doxia/sink/impl/AbstractSinkTest.java | 6 +- .../doxia/sink/impl/SinkEventTestingSink.java | 8 +- .../apache/maven/doxia/module/apt/AptParser.java | 4 +- .../org/apache/maven/doxia/module/apt/AptSink.java | 5 - .../maven/doxia/module/apt/AptParserTest.java | 6 +- .../apache/maven/doxia/module/apt/AptSinkTest.java | 4 +- .../maven/doxia/module/markdown/MarkdownSink.java | 81 +- .../doxia/module/markdown/MarkdownParserTest.java | 28 +- .../doxia/module/markdown/MarkdownSinkTest.java | 47 +- .../src/test/resources/table.html | 31 + .../apache/maven/doxia/module/xdoc/XdocParser.java | 4 +- .../maven/doxia/module/xdoc/XdocParserTest.java | 24 + .../java/org/apache/maven/doxia/sink/Sink.java | 12 +- pom.xml | 1 + 18 files changed, 726 insertions(+), 401 deletions(-) diff --git a/doxia-core/src/main/java/org/apache/maven/doxia/parser/Xhtml5BaseParser.java b/doxia-core/src/main/java/org/apache/maven/doxia/parser/Xhtml5BaseParser.java index a2b03c81..4d44c971 100644 --- a/doxia-core/src/main/java/org/apache/maven/doxia/parser/Xhtml5BaseParser.java +++ b/doxia-core/src/main/java/org/apache/maven/doxia/parser/Xhtml5BaseParser.java @@ -21,6 +21,8 @@ package 
org.apache.maven.doxia.parser; import javax.swing.text.html.HTML.Attribute; import java.io.Reader; +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; import java.util.HashSet; import java.util.LinkedList; import java.util.Set; @@ -109,6 +111,9 @@ public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup { /** Used to distinguish <a href=""> from <a name="">. */ private boolean isLink; + /** If true, the next text event is at the beginning of a line inside a block element, i.e. after a block tag or a line break/end block tag. */ + protected boolean isBeginningOfLineInsideBlock = true; + /** Used to distinguish <a href=""> from <a name="">. */ private boolean isAnchor; @@ -192,181 +197,274 @@ public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup { protected boolean baseStartTag(String elementName, SinkEventAttributeSet attribs, Sink sink) { boolean visited = true; - - if (elementName.equals(HtmlMarkup.ARTICLE.toString())) { - sink.article(attribs); - } else if (elementName.equals(HtmlMarkup.NAV.toString())) { - sink.navigation(attribs); - } else if (elementName.equals(HtmlMarkup.ASIDE.toString())) { - sink.sidebar(attribs); - } else if (elementName.equals(HtmlMarkup.SECTION.toString())) { - handleSectionStart(sink, attribs); - } else if (elementName.equals(HtmlMarkup.H1.toString())) { - handleHeadingStart(sink, Sink.SECTION_LEVEL_1, attribs); - } else if (elementName.equals(HtmlMarkup.H2.toString())) { - handleHeadingStart(sink, Sink.SECTION_LEVEL_2, attribs); - } else if (elementName.equals(HtmlMarkup.H3.toString())) { - handleHeadingStart(sink, Sink.SECTION_LEVEL_3, attribs); - } else if (elementName.equals(HtmlMarkup.H4.toString())) { - handleHeadingStart(sink, Sink.SECTION_LEVEL_4, attribs); - } else if (elementName.equals(HtmlMarkup.H5.toString())) { - handleHeadingStart(sink, Sink.SECTION_LEVEL_5, attribs); - } else if (elementName.equals(HtmlMarkup.H6.toString())) { - 
handleHeadingStart(sink, Sink.SECTION_LEVEL_6, attribs); - } else if (elementName.equals(HtmlMarkup.HEADER.toString())) { - sink.header(attribs); - } else if (elementName.equals(HtmlMarkup.MAIN.toString())) { - sink.content(attribs); - } else if (elementName.equals(HtmlMarkup.FOOTER.toString())) { - sink.footer(attribs); - } else if (elementName.equals(HtmlMarkup.EM.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.EMPHASIS); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.STRONG.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.STRONG); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.SMALL.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.SMALL); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.S.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.LINE_THROUGH); - sink.inline(attribs); - /* deprecated line-through support */ - } else if (elementName.equals(HtmlMarkup.CITE.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.CITATION); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.Q.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.QUOTE); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.DFN.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.DEFINITION); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.ABBR.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.ABBREVIATION); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.I.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.ITALIC); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.B.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.BOLD); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.CODE.toString())) { - 
attribs.addAttributes(SinkEventAttributeSet.Semantics.CODE); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.VAR.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.VARIABLE); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.SAMP.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.SAMPLE); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.KBD.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.KEYBOARD); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.SUP.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.SUPERSCRIPT); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.SUB.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.SUBSCRIPT); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.U.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.ANNOTATION); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.MARK.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.HIGHLIGHT); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.RUBY.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.RB.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_BASE); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.RT.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.RTC.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.RP.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_PARANTHESES); - sink.inline(attribs); - } else if 
(elementName.equals(HtmlMarkup.BDI.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.BDO.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.SPAN.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.PHRASE); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.INS.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.INSERT); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.DEL.toString())) { - attribs.addAttributes(SinkEventAttributeSet.Semantics.DELETE); - sink.inline(attribs); - } else if (elementName.equals(HtmlMarkup.P.toString())) { - handlePStart(sink, attribs); - } else if (elementName.equals(HtmlMarkup.DIV.toString())) { - handleDivStart(attribs, sink); - } else if (elementName.equals(HtmlMarkup.PRE.toString())) { - handlePreStart(attribs, sink); - } else if (elementName.equals(HtmlMarkup.UL.toString())) { - sink.list(attribs); - } else if (elementName.equals(HtmlMarkup.OL.toString())) { - handleOLStart(sink, attribs); - } else if (elementName.equals(HtmlMarkup.LI.toString())) { - handleLIStart(sink, attribs); - } else if (elementName.equals(HtmlMarkup.DL.toString())) { - sink.definitionList(attribs); - } else if (elementName.equals(HtmlMarkup.DT.toString())) { - if (hasDefinitionListItem) { - // close previous listItem - sink.definitionListItem_(); - } - sink.definitionListItem(attribs); - hasDefinitionListItem = true; - sink.definedTerm(attribs); - } else if (elementName.equals(HtmlMarkup.DD.toString())) { - if (!hasDefinitionListItem) { + isBeginningOfLineInsideBlock = true; + switch (elementName) { + case "article": + sink.article(attribs); + break; + case "nav": + sink.navigation(attribs); + break; + case "aside": + sink.sidebar(attribs); + break; + case "section": 
+ handleSectionStart(sink, attribs); + break; + case "h1": + handleHeadingStart(sink, Sink.SECTION_LEVEL_1, attribs); + break; + case "h2": + handleHeadingStart(sink, Sink.SECTION_LEVEL_2, attribs); + break; + case "h3": + handleHeadingStart(sink, Sink.SECTION_LEVEL_3, attribs); + break; + case "h4": + handleHeadingStart(sink, Sink.SECTION_LEVEL_4, attribs); + break; + case "h5": + handleHeadingStart(sink, Sink.SECTION_LEVEL_5, attribs); + break; + case "h6": + handleHeadingStart(sink, Sink.SECTION_LEVEL_6, attribs); + break; + case "header": + sink.header(attribs); + break; + case "main": + sink.content(attribs); + break; + case "footer": + sink.footer(attribs); + break; + case "em": + attribs.addAttributes(SinkEventAttributeSet.Semantics.EMPHASIS); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "strong": + attribs.addAttributes(SinkEventAttributeSet.Semantics.STRONG); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "small": + attribs.addAttributes(SinkEventAttributeSet.Semantics.SMALL); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "s": + /* deprecated line-through support */ + attribs.addAttributes(SinkEventAttributeSet.Semantics.LINE_THROUGH); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "cite": + attribs.addAttributes(SinkEventAttributeSet.Semantics.CITATION); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "q": + attribs.addAttributes(SinkEventAttributeSet.Semantics.QUOTE); + sink.inline(attribs); + break; + case "dfn": + attribs.addAttributes(SinkEventAttributeSet.Semantics.DEFINITION); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "abbr": + attribs.addAttributes(SinkEventAttributeSet.Semantics.ABBREVIATION); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "i": + attribs.addAttributes(SinkEventAttributeSet.Semantics.ITALIC); + 
sink.inline(attribs); + break; + case "b": + attribs.addAttributes(SinkEventAttributeSet.Semantics.BOLD); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "code": + attribs.addAttributes(SinkEventAttributeSet.Semantics.CODE); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "var": + attribs.addAttributes(SinkEventAttributeSet.Semantics.VARIABLE); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "samp": + attribs.addAttributes(SinkEventAttributeSet.Semantics.SAMPLE); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "kbd": + attribs.addAttributes(SinkEventAttributeSet.Semantics.KEYBOARD); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "sup": + attribs.addAttributes(SinkEventAttributeSet.Semantics.SUPERSCRIPT); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "sub": + attribs.addAttributes(SinkEventAttributeSet.Semantics.SUBSCRIPT); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "u": + attribs.addAttributes(SinkEventAttributeSet.Semantics.ANNOTATION); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "mark": + attribs.addAttributes(SinkEventAttributeSet.Semantics.HIGHLIGHT); + sink.inline(attribs); + break; + case "ruby": + attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "rb": + attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_BASE); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "rt": + attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "rtc": + attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + 
case "rp": + attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_PARANTHESES); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "bdi": + attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "bdo": + attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "span": + attribs.addAttributes(SinkEventAttributeSet.Semantics.PHRASE); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "ins": + attribs.addAttributes(SinkEventAttributeSet.Semantics.INSERT); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "del": + attribs.addAttributes(SinkEventAttributeSet.Semantics.DELETE); + sink.inline(attribs); + isBeginningOfLineInsideBlock = false; + break; + case "p": + handlePStart(sink, attribs); + break; + case "div": + handleDivStart(attribs, sink); + break; + case "pre": + handlePreStart(attribs, sink); + break; + case "ul": + sink.list(attribs); + break; + case "ol": + handleOLStart(sink, attribs); + break; + case "li": + handleLIStart(sink, attribs); + break; + case "dl": + sink.definitionList(attribs); + break; + case "dt": + if (hasDefinitionListItem) { + // close previous listItem + sink.definitionListItem_(); + } sink.definitionListItem(attribs); - } - sink.definition(attribs); - } else if (elementName.equals(HtmlMarkup.FIGURE.toString())) { - sink.figure(attribs); - } else if (elementName.equals(HtmlMarkup.FIGCAPTION.toString())) { - sink.figureCaption(attribs); - } else if (elementName.equals(HtmlMarkup.A.toString())) { - handleAStart(sink, attribs); - } else if (elementName.equals(HtmlMarkup.TABLE.toString())) { - handleTableStart(sink, attribs); - } else if (elementName.equals(HtmlMarkup.TR.toString())) { - sink.tableRow(attribs); - } else if 
(elementName.equals(HtmlMarkup.TH.toString())) { - sink.tableHeaderCell(attribs); - } else if (elementName.equals(HtmlMarkup.TD.toString())) { - sink.tableCell(attribs); - } else if (elementName.equals(HtmlMarkup.CAPTION.toString())) { - sink.tableCaption(attribs); - } else if (elementName.equals(HtmlMarkup.BR.toString())) { - sink.lineBreak(attribs); - } else if (elementName.equals(HtmlMarkup.WBR.toString())) { - sink.lineBreakOpportunity(attribs); - } else if (elementName.equals(HtmlMarkup.HR.toString())) { - sink.horizontalRule(attribs); - } else if (elementName.equals(HtmlMarkup.IMG.toString())) { - handleImgStart(sink, attribs); - } else if (elementName.equals(HtmlMarkup.BLOCKQUOTE.toString())) { - sink.blockquote(attribs); - } else if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) { - handleUnknown(elementName, attribs, sink, TAG_TYPE_START); - } else if (UNMATCHED_XHTML5_SIMPLE_ELEMENTS.contains(elementName)) { - handleUnknown(elementName, attribs, sink, TAG_TYPE_SIMPLE); - } else if (elementName.equals(HtmlMarkup.SCRIPT.toString()) - || elementName.equals(HtmlMarkup.STYLE.toString())) { - handleUnknown(elementName, attribs, sink, TAG_TYPE_START); - scriptBlock = true; - } else { - visited = false; + hasDefinitionListItem = true; + sink.definedTerm(attribs); + break; + case "dd": + if (!hasDefinitionListItem) { + sink.definitionListItem(attribs); + } + sink.definition(attribs); + break; + case "figure": + sink.figure(attribs); + break; + case "figcaption": + sink.figureCaption(attribs); + break; + case "a": + handleAStart(sink, attribs); + break; + case "table": + handleTableStart(sink, attribs); + break; + case "tr": + sink.tableRow(attribs); + break; + case "th": + sink.tableHeaderCell(attribs); + break; + case "td": + sink.tableCell(attribs); + break; + case "caption": + sink.tableCaption(attribs); + break; + case "br": + sink.lineBreak(attribs); + break; + case "wbr": + sink.lineBreakOpportunity(attribs); + break; + case "hr": + 
sink.horizontalRule(attribs); + break; + case "img": + handleImgStart(sink, attribs); + break; + case "blockquote": + sink.blockquote(attribs); + break; + case "script": + case "style": + handleUnknown(elementName, attribs, sink, TAG_TYPE_START); + scriptBlock = true; + break; + default: + if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) { + handleUnknown(elementName, attribs, sink, TAG_TYPE_START); + } else if (UNMATCHED_XHTML5_SIMPLE_ELEMENTS.contains(elementName)) { + handleUnknown(elementName, attribs, sink, TAG_TYPE_SIMPLE); + } else { + visited = false; + } + break; } return visited; @@ -391,150 +489,159 @@ public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup { protected boolean baseEndTag(String elementName, SinkEventAttributeSet attribs, Sink sink) { boolean visited = true; - - if (elementName.equals(HtmlMarkup.P.toString())) { - sink.paragraph_(); - } else if (elementName.equals(HtmlMarkup.DIV.toString())) { - handleDivEnd(sink); - } else if (elementName.equals(HtmlMarkup.PRE.toString())) { - verbatim_(); - - sink.verbatim_(); - } else if (elementName.equals(HtmlMarkup.UL.toString())) { - sink.list_(); - } else if (elementName.equals(HtmlMarkup.OL.toString())) { - sink.numberedList_(); - orderedListDepth--; - } else if (elementName.equals(HtmlMarkup.LI.toString())) { - handleListItemEnd(sink); - } else if (elementName.equals(HtmlMarkup.DL.toString())) { - if (hasDefinitionListItem) { + isBeginningOfLineInsideBlock = true; + + switch (elementName) { + case "p": + sink.paragraph_(); + break; + case "div": + handleDivEnd(sink); + break; + case "pre": + verbatim_(); + sink.verbatim_(); + break; + case "ul": + sink.list_(); + break; + case "ol": + sink.numberedList_(); + orderedListDepth--; + break; + case "li": + handleListItemEnd(sink); + break; + case "dl": + if (hasDefinitionListItem) { + sink.definitionListItem_(); + hasDefinitionListItem = false; + } + sink.definitionList_(); + break; + case "dt": + sink.definedTerm_(); + 
break; + case "dd": + sink.definition_(); sink.definitionListItem_(); hasDefinitionListItem = false; - } - sink.definitionList_(); - } else if (elementName.equals(HtmlMarkup.DT.toString())) { - sink.definedTerm_(); - } else if (elementName.equals(HtmlMarkup.DD.toString())) { - sink.definition_(); - sink.definitionListItem_(); - hasDefinitionListItem = false; - } else if (elementName.equals(HtmlMarkup.FIGURE.toString())) { - sink.figure_(); - } else if (elementName.equals(HtmlMarkup.FIGCAPTION.toString())) { - sink.figureCaption_(); - } else if (elementName.equals(HtmlMarkup.A.toString())) { - handleAEnd(sink); - } else if (elementName.equals(HtmlMarkup.EM.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.STRONG.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.SMALL.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.S.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.CITE.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.Q.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.DFN.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.ABBR.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.I.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.B.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.CODE.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.VAR.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.SAMP.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.KBD.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.SUP.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.SUB.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.U.toString())) { - sink.inline_(); - } else if 
(elementName.equals(HtmlMarkup.MARK.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.RUBY.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.RB.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.RT.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.RTC.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.RP.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.BDI.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.BDO.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.SPAN.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.INS.toString())) { - sink.inline_(); - } else if (elementName.equals(HtmlMarkup.DEL.toString())) { - sink.inline_(); - } - - // ---------------------------------------------------------------------- - // Tables - // ---------------------------------------------------------------------- - - else if (elementName.equals(HtmlMarkup.TABLE.toString())) { - sink.tableRows_(); - sink.table_(); - } else if (elementName.equals(HtmlMarkup.TR.toString())) { - sink.tableRow_(); - } else if (elementName.equals(HtmlMarkup.TH.toString())) { - sink.tableHeaderCell_(); - } else if (elementName.equals(HtmlMarkup.TD.toString())) { - sink.tableCell_(); - } else if (elementName.equals(HtmlMarkup.CAPTION.toString())) { - sink.tableCaption_(); - } else if (elementName.equals(HtmlMarkup.ARTICLE.toString())) { - sink.article_(); - } else if (elementName.equals(HtmlMarkup.NAV.toString())) { - sink.navigation_(); - } else if (elementName.equals(HtmlMarkup.ASIDE.toString())) { - sink.sidebar_(); - } else if (elementName.equals(HtmlMarkup.SECTION.toString())) { - handleSectionEnd(sink); - } else if (elementName.equals(HtmlMarkup.H1.toString())) { - sink.sectionTitle1_(); - } else if (elementName.equals(HtmlMarkup.H2.toString())) { - sink.sectionTitle2_(); - 
} else if (elementName.equals(HtmlMarkup.H3.toString())) { - sink.sectionTitle3_(); - } else if (elementName.equals(HtmlMarkup.H4.toString())) { - sink.sectionTitle4_(); - } else if (elementName.equals(HtmlMarkup.H5.toString())) { - sink.sectionTitle5_(); - } else if (elementName.equals(HtmlMarkup.H6.toString())) { - sink.sectionTitle6_(); - } else if (elementName.equals(HtmlMarkup.HEADER.toString())) { - sink.header_(); - } else if (elementName.equals(HtmlMarkup.MAIN.toString())) { - sink.content_(); - } else if (elementName.equals(HtmlMarkup.FOOTER.toString())) { - sink.footer_(); - } else if (elementName.equals(HtmlMarkup.BLOCKQUOTE.toString())) { - sink.blockquote_(); - } else if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) { - handleUnknown(elementName, attribs, sink, TAG_TYPE_END); - } else if (elementName.equals(HtmlMarkup.SCRIPT.toString()) - || elementName.equals(HtmlMarkup.STYLE.toString())) { - handleUnknown(elementName, attribs, sink, TAG_TYPE_END); - - scriptBlock = false; - } else { - visited = false; + break; + case "figure": + sink.figure_(); + break; + case "figcaption": + sink.figureCaption_(); + break; + case "a": + handleAEnd(sink); + break; + case "em": + case "strong": + case "small": + case "s": + case "cite": + case "q": + case "dfn": + case "abbr": + case "i": + case "b": + case "code": + case "var": + case "samp": + case "kbd": + case "sup": + case "sub": + case "u": + case "mark": + case "ruby": + case "rb": + case "rt": + case "rtc": + case "rp": + case "bdi": + case "bdo": + case "span": + case "ins": + case "del": + sink.inline_(); + isBeginningOfLineInsideBlock = false; + break; + + // ---------------------------------------------------------------------- + // Tables + // ---------------------------------------------------------------------- + + case "table": + sink.tableRows_(); + sink.table_(); + break; + case "tr": + sink.tableRow_(); + break; + case "th": + sink.tableHeaderCell_(); + break; + case "td": + sink.tableCell_(); + 
break; + case "caption": + sink.tableCaption_(); + break; + case "article": + sink.article_(); + break; + case "nav": + sink.navigation_(); + break; + case "aside": + sink.sidebar_(); + break; + case "section": + handleSectionEnd(sink); + break; + case "h1": + sink.sectionTitle1_(); + break; + case "h2": + sink.sectionTitle2_(); + break; + case "h3": + sink.sectionTitle3_(); + break; + case "h4": + sink.sectionTitle4_(); + break; + case "h5": + sink.sectionTitle5_(); + break; + case "h6": + sink.sectionTitle6_(); + break; + case "header": + sink.header_(); + break; + case "main": + sink.content_(); + break; + case "footer": + sink.footer_(); + break; + case "blockquote": + sink.blockquote_(); + break; + case "script": + case "style": + handleUnknown(elementName, attribs, sink, TAG_TYPE_END); + scriptBlock = false; + break; + default: + if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) { + handleUnknown(elementName, attribs, sink, TAG_TYPE_END); + } else { + visited = false; + } + break; } return visited; @@ -574,19 +681,79 @@ public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup { protected void handleText(XmlPullParser parser, Sink sink) throws XmlPullParserException { String text = getText(parser); - /* - * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the - * parser so any whitespace that makes it here is significant. - * - * NOTE: text within script tags is ignored, scripting code should be embedded in CDATA. 
- */ + if (!inVerbatim && text != null) { + // do special whitespace processing as outlined in + // https://developer.mozilla.org/en-US/docs/Web/CSS/Guides/Text/Whitespace + if (isBeginningOfLineInsideBlock) { + // normalize linebreaks + processInsignificantLineBreaks(sink, text); + // trim leading whitespace from text being emitted + // https://developer.mozilla.org/en-US/docs/Web/CSS/Guides/Text/Whitespace#trimming_and_positioning + String regex = "^\\s+"; + text = text.replaceAll(regex, ""); + } + + // assume white-space-collapse: collapse for all non-verbatim text (outside of <pre>) + text = collapseWhitespace(text); + } if ((text != null && !text.isEmpty()) && !isScriptBlock()) { sink.text(text); + isBeginningOfLineInsideBlock = false; } } + /** + * Process all line-breaks in the given text which are not significant for the output, i.e. all line-breaks which are not within a verbatim block and + * are at the beginning of the given text. + * In addition it emits information about the whitespace characters following the line-breaks as they may be relevant for the output (e.g. for indentation). + * + * @param sink the sink to receive the events. + * @param text the text to process. 
+ */ + protected void processInsignificantLineBreaks(Sink sink, String text) { + CharacterIterator it = new StringCharacterIterator(text.replaceAll("\\r\\n?", "\n")); + + boolean wasNewLine = false; + int indentLevel = 0; + // + while (it.current() != CharacterIterator.DONE) { + char c = it.current(); + if (c == '\n') { + if (wasNewLine) { + sink.markupLineBreak(indentLevel); + } + indentLevel = 0; + wasNewLine = true; + } else if (Character.isWhitespace(c)) { + indentLevel++; + } else { + // once non-whitespace character is reached we assume everything following is relevant and emitted + // within the text event + break; + } + it.next(); + } + if (wasNewLine) { + // if the text ends with a newline, we need to emit the last line break + sink.markupLineBreak(indentLevel); + } + } + + /** + * @see <a href="https://developer.mozilla.org/en-US/docs/Web/CSS/Guides/Text/Whitespace#how_does_css_process_whitespace">How does CSS process whitespace?</a> + * @see <a href="https://drafts.csswg.org/css-text-4/#white-space-processing">CSS Text Module Level 4 - White Space Processing</a> + * + * @param text + * @return + */ + private static String collapseWhitespace(String text) { + // replace all sequences of whitespace characters with a single space (this includes newlines, tabs, etc.) 
+ return text.replaceAll("\\s+", " "); + } + @Override protected void handleComment(XmlPullParser parser, Sink sink) throws XmlPullParserException { + isBeginningOfLineInsideBlock = false; String text = getText(parser); if ("PB".equals(text.trim())) { @@ -600,6 +767,7 @@ public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup { @Override protected void handleCdsect(XmlPullParser parser, Sink sink) throws XmlPullParserException { + isBeginningOfLineInsideBlock = false; String text = getText(parser); if (isScriptBlock()) { diff --git a/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/SinkWrapper.java b/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/SinkWrapper.java index b8c9fff7..d82d3423 100644 --- a/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/SinkWrapper.java +++ b/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/SinkWrapper.java @@ -514,8 +514,8 @@ public class SinkWrapper extends AbstractSink { } @Override - public void comment(String comment, boolean endsWithLineBreak) { - delegate.comment(comment, endsWithLineBreak); + public void markupLineBreak(int indentLevel) { + delegate.markupLineBreak(indentLevel); } @Override diff --git a/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/Xhtml5BaseSink.java b/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/Xhtml5BaseSink.java index c4c66afa..9a5e88dc 100644 --- a/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/Xhtml5BaseSink.java +++ b/doxia-core/src/main/java/org/apache/maven/doxia/sink/impl/Xhtml5BaseSink.java @@ -1538,17 +1538,12 @@ public class Xhtml5BaseSink extends AbstractXmlSink implements HtmlMarkup { @Override public void comment(String comment) { - comment(comment, false); - } - - @Override - public void comment(String comment, boolean endsWithLineBreak) { if (comment != null) { - write(encodeAsHtmlComment(comment, endsWithLineBreak, getLocationLogPrefix())); + write(encodeAsHtmlComment(comment, 
getLocationLogPrefix())); } } - public static String encodeAsHtmlComment(String comment, boolean endsWithLineBreak, String locationLogPrefix) { + public static String encodeAsHtmlComment(String comment, String locationLogPrefix) { final String originalComment = comment; // http://www.w3.org/TR/2000/REC-xml-20001006#sec-comments @@ -1569,12 +1564,18 @@ public class Xhtml5BaseSink extends AbstractXmlSink implements HtmlMarkup { buffer.append(LESS_THAN).append(BANG).append(MINUS).append(MINUS); buffer.append(comment); buffer.append(MINUS).append(MINUS).append(GREATER_THAN); - if (endsWithLineBreak) { - buffer.append(EOL); - } return buffer.toString(); } + @Override + public void markupLineBreak(int indentLevel) { + if (headFlag) { + getTextBuffer().append(EOL); + } else { + write(EOL); + } + } + /** * {@inheritDoc} * diff --git a/doxia-core/src/test/java/org/apache/maven/doxia/parser/Xhtml5BaseParserTest.java b/doxia-core/src/test/java/org/apache/maven/doxia/parser/Xhtml5BaseParserTest.java index b29b8e3d..54fc5178 100644 --- a/doxia-core/src/test/java/org/apache/maven/doxia/parser/Xhtml5BaseParserTest.java +++ b/doxia-core/src/test/java/org/apache/maven/doxia/parser/Xhtml5BaseParserTest.java @@ -280,8 +280,8 @@ class Xhtml5BaseParserTest extends AbstractParserTest { el = it.next(); assertEquals("text", el.getName()); - // according to section 2.11 of the XML spec, parsers must normalize line breaks to "\n" - assertEquals("\n", (String) el.getArgs()[0]); + // the EOL must be normalized to a single space, as per the HTML spec + assertEquals(" ", (String) el.getArgs()[0]); assertEquals("inline", it.next().getName()); assertEquals("text", it.next().getName()); @@ -366,6 +366,31 @@ class Xhtml5BaseParserTest extends AbstractParserTest { assertEquals("verbatim_", it.next().getName()); } + @Test + void listWithInsignificantLineBreaks() throws Exception { + // test EOLs within lists (those don't have significance and should not be reported as text events, but as + // 
markupLineBreak with the according indent level) + String text = "<ul>" + Xhtml5BaseParser.EOL + " <li>One</li> " + + Xhtml5BaseParser.EOL + " <li>Two</li> " + + Xhtml5BaseParser.EOL + "</ul>"; + + parser.parse(text, sink); + + Iterator<SinkEventElement> it = sink.getEventList().iterator(); + + assertEquals("list", it.next().getName()); + assertSinkEquals(it.next(), "markupLineBreak", new Object[] {2}); + assertEquals("listItem", it.next().getName()); + assertEquals("text", it.next().getName()); + assertEquals("listItem_", it.next().getName()); + assertSinkEquals(it.next(), "markupLineBreak", new Object[] {2}); + assertEquals("listItem", it.next().getName()); + assertEquals("text", it.next().getName()); + assertEquals("listItem_", it.next().getName()); + assertSinkEquals(it.next(), "markupLineBreak", new Object[] {0}); + assertEquals("list_", it.next().getName()); + } + @Test void doxia250() throws Exception { StringBuilder sb = new StringBuilder(); diff --git a/doxia-core/src/test/java/org/apache/maven/doxia/sink/impl/AbstractSinkTest.java b/doxia-core/src/test/java/org/apache/maven/doxia/sink/impl/AbstractSinkTest.java index 022f80d0..424e52bf 100644 --- a/doxia-core/src/test/java/org/apache/maven/doxia/sink/impl/AbstractSinkTest.java +++ b/doxia-core/src/test/java/org/apache/maven/doxia/sink/impl/AbstractSinkTest.java @@ -1204,8 +1204,10 @@ public abstract class AbstractSinkTest extends AbstractModuleTest { @Test public void twoConsecutiveBlockComments() { String comment = "Simple comment"; - sink.comment(comment, true); - sink.comment(comment, true); + sink.comment(comment); + sink.markupLineBreak(0); + sink.comment(comment); + sink.markupLineBreak(0); sink.flush(); sink.close(); assertEquals(getCommentBlock(comment) + EOL + getCommentBlock(comment) + EOL, testWriter.toString()); diff --git a/doxia-core/src/test/java/org/apache/maven/doxia/sink/impl/SinkEventTestingSink.java b/doxia-core/src/test/java/org/apache/maven/doxia/sink/impl/SinkEventTestingSink.java 
index 3a35f8fd..714c8ebf 100644 --- a/doxia-core/src/test/java/org/apache/maven/doxia/sink/impl/SinkEventTestingSink.java +++ b/doxia-core/src/test/java/org/apache/maven/doxia/sink/impl/SinkEventTestingSink.java @@ -288,13 +288,13 @@ public class SinkEventTestingSink extends AbstractSink { } @Override - public void comment(String comment, boolean endsWithLineBreak) { - addEvent("comment", new Object[] {comment, endsWithLineBreak}); + public void comment(String comment) { + addEvent("comment", new Object[] {comment}); } @Override - public void comment(String comment) { - addEvent("comment", new Object[] {comment}); + public void markupLineBreak(int indentLevel) { + addEvent("markupLineBreak", new Object[] {indentLevel}); } @Override diff --git a/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptParser.java b/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptParser.java index 7f03b19d..aed398ba 100644 --- a/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptParser.java +++ b/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptParser.java @@ -1838,7 +1838,9 @@ public class AptParser extends AbstractTextParser implements AptMarkup { public void traverse() throws AptParseException { if (isEmitComments()) { - AptParser.this.sink.comment(text, true); + AptParser.this.sink.comment(text); + // APT comments always end with a line break + AptParser.this.sink.markupLineBreak(0); } } } diff --git a/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptSink.java b/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptSink.java index dab9a813..47a5a255 100644 --- a/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptSink.java +++ b/doxia-modules/doxia-module-apt/src/main/java/org/apache/maven/doxia/module/apt/AptSink.java @@ -834,11 +834,6 @@ public class AptSink extends 
AbstractTextSink implements AptMarkup { } public void comment(String comment) { - comment(comment, false); - } - - @Override - public void comment(String comment, boolean endsWithLineBreak) { rawText("" + COMMENT + COMMENT + comment + EOL); // comments always end with a line break in APT } diff --git a/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptParserTest.java b/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptParserTest.java index 63b76d4d..fbf0620f 100644 --- a/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptParserTest.java +++ b/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptParserTest.java @@ -106,8 +106,10 @@ class AptParserTest extends AbstractParserTest { "paragraph", "text", "paragraph_"); - assertSinkEquals(it.next(), "comment", "some comment", Boolean.TRUE); - assertSinkEquals(it.next(), "comment", "another comment", Boolean.TRUE); + assertSinkEquals(it.next(), "comment", "some comment"); + assertSinkEquals(it.next(), "markupLineBreak", 0); + assertSinkEquals(it.next(), "comment", "another comment"); + assertSinkEquals(it.next(), "markupLineBreak", 0); assertSinkEquals(it, "paragraph", "text", "paragraph_", "section1_", "body_"); } diff --git a/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptSinkTest.java b/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptSinkTest.java index bbad9639..13d32fbe 100644 --- a/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptSinkTest.java +++ b/doxia-modules/doxia-module-apt/src/test/java/org/apache/maven/doxia/module/apt/AptSinkTest.java @@ -296,8 +296,8 @@ class AptSinkTest extends AbstractSinkTest { public void twoConsecutiveBlockComments() { final Sink sink = getSink(); String comment = "Simple comment"; - sink.comment(comment, true); - sink.comment(comment, true); + 
sink.comment(comment); + sink.comment(comment); sink.flush(); sink.close(); assertEquals(getCommentBlock(comment) + getCommentBlock(comment), getSinkContent(), "Wrong comment!"); diff --git a/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownSink.java b/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownSink.java index 88c987a0..a8750000 100644 --- a/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownSink.java +++ b/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownSink.java @@ -107,23 +107,25 @@ public class MarkdownSink extends Xhtml5BaseSink implements MarkdownMarkup { } /** Most important contextual metadata (of elements). This contains information about necessary escaping rules, potential prefixes and newlines */ enum ElementContext { - HEAD(Type.GENERIC_CONTAINER, null, true), - BODY(Type.GENERIC_CONTAINER, ElementContext::escapeMarkdown), + HEAD(Type.GENERIC_CONTAINER, false, null, true), + BODY(Type.GENERIC_CONTAINER, true, ElementContext::escapeMarkdown), // only the elements, which affect rendering of children and are different from BODY or HEAD are listed here - FIGURE(Type.INLINE, ElementContext::escapeMarkdown, true), - HEADING(Type.LEAF_BLOCK, ElementContext::escapeMarkdown), - CODE_BLOCK(Type.LEAF_BLOCK, null), - CODE_SPAN(Type.INLINE, null, true), - TABLE_CAPTION(Type.INLINE, ElementContext::escapeMarkdown), - TABLE_ROW(Type.CONTAINER_BLOCK, null, true), + FIGURE(Type.INLINE, false, ElementContext::escapeMarkdown, true), + HEADING(Type.LEAF_BLOCK, false, ElementContext::escapeMarkdown), + CODE_BLOCK(Type.LEAF_BLOCK, false, null), + CODE_SPAN(Type.INLINE, false, null, true), + TABLE(Type.CONTAINER_BLOCK, false, null, false, "", true), + TABLE_CAPTION(Type.INLINE, false, ElementContext::escapeMarkdown), + TABLE_ROW(Type.INLINE, false, null, true), // special handling 
of newlines TABLE_CELL( - Type.LEAF_BLOCK, + Type.INLINE, + false, ElementContext::escapeForTableCell, false), // special type, as allows containing inlines, but not starting on a separate line // same parameters as BODY but paragraphs inside list items are handled differently - LIST_ITEM(Type.CONTAINER_BLOCK, ElementContext::escapeMarkdown, false, INDENT), - BLOCKQUOTE(Type.CONTAINER_BLOCK, ElementContext::escapeMarkdown, false, BLOCKQUOTE_START_MARKUP), - HTML_BLOCK(Type.LEAF_BLOCK, ElementContext::escapeHtml, false, "", true); + LIST_ITEM(Type.CONTAINER_BLOCK, false, ElementContext::escapeMarkdown, false, INDENT), + BLOCKQUOTE(Type.CONTAINER_BLOCK, false, ElementContext::escapeMarkdown, false, BLOCKQUOTE_START_MARKUP), + HTML_BLOCK(Type.LEAF_BLOCK, true, ElementContext::escapeHtml, false, "", true); /** * @see <a href="https://spec.commonmark.org/0.30/#blocks-and-inlines">CommonMark, 3 Blocks and inlines</a> @@ -170,26 +172,34 @@ public class MarkdownSink extends Xhtml5BaseSink implements MarkdownMarkup { * Only relevant for block element, if set to {@code true} the element requires to be surrounded by blank lines. */ final boolean requiresSurroundingByBlankLines; + + /** + * If markup linebreaks (i.e. insignificant linebreaks in the source) are allowed in this context. + * This is relevant for markdown as in some contexts (e.g. 
list items) linebreaks are always significant (while for HTML they wouldn't be) + */ + final boolean allowsMarkupLinebreaks; - ElementContext(Type type, TextEscapeFunction escapeFunction) { - this(type, escapeFunction, false); + ElementContext(Type type, boolean allowsMarkupLinebreaks, TextEscapeFunction escapeFunction) { + this(type, allowsMarkupLinebreaks, escapeFunction, false); } - ElementContext(Type type, TextEscapeFunction escapeFunction, boolean requiresBuffering) { - this(type, escapeFunction, requiresBuffering, ""); + ElementContext(Type type, boolean allowsMarkupLinebreaks, TextEscapeFunction escapeFunction, boolean requiresBuffering) { + this(type, allowsMarkupLinebreaks, escapeFunction, requiresBuffering, ""); } - ElementContext(Type type, TextEscapeFunction escapeFunction, boolean requiresBuffering, String prefix) { - this(type, escapeFunction, requiresBuffering, prefix, false); + ElementContext(Type type, boolean allowsMarkupLinebreaks, TextEscapeFunction escapeFunction, boolean requiresBuffering, String prefix) { + this(type, allowsMarkupLinebreaks, escapeFunction, requiresBuffering, prefix, false); } ElementContext( Type type, + boolean allowsMarkupLinebreaks, TextEscapeFunction escapeFunction, boolean requiresBuffering, String prefix, boolean requiresSurroundingByBlankLines) { this.type = type; + this.allowsMarkupLinebreaks = allowsMarkupLinebreaks; this.escapeFunction = escapeFunction; this.requiresBuffering = requiresBuffering; if (type != Type.CONTAINER_BLOCK && prefix.length() != 0) { @@ -236,6 +246,11 @@ public class MarkdownSink extends Xhtml5BaseSink implements MarkdownMarkup { return type == Type.CONTAINER_BLOCK || type == Type.GENERIC_CONTAINER; } + + public boolean isAllowsMarkupLinebreaks() { + return allowsMarkupLinebreaks; + } + /** * First use XML escaping (leveraging the predefined entities, for browsers) * afterwards escape special characters in a text with a leading backslash (for markdown parsers) @@ -756,8 +771,7 @@ public class 
MarkdownSink extends Xhtml5BaseSink implements MarkdownMarkup { if (elementContextStack.element().isHtml()) { super.table(attributes); } else { - ensureBlankLine(); - writeUnescaped(getLinePrefix()); + startContext(ElementContext.TABLE); } } @@ -765,6 +779,8 @@ public class MarkdownSink extends Xhtml5BaseSink implements MarkdownMarkup { public void table_() { if (elementContextStack.element().isHtml()) { super.table_(); + } else { + endContext(ElementContext.TABLE); } } @@ -1211,12 +1227,16 @@ public class MarkdownSink extends Xhtml5BaseSink implements MarkdownMarkup { @Override public void lineBreak(SinkEventAttributes attributes) { - if (elementContextStack.element() == ElementContext.CODE_BLOCK) { - writeUnescaped(EOL); + if (elementContextStack.element() == ElementContext.TABLE_CELL) { + super.lineBreak(attributes); } else { - writeUnescaped("" + SPACE + SPACE + EOL); + if (elementContextStack.element() == ElementContext.CODE_BLOCK) { + writeUnescaped(EOL); + } else { + writeUnescaped("" + SPACE + SPACE + EOL); + } + writeUnescaped(getLinePrefix()); } - writeUnescaped(getLinePrefix()); } @Override @@ -1271,6 +1291,14 @@ public class MarkdownSink extends Xhtml5BaseSink implements MarkdownMarkup { LOGGER.warn("{}Unknown Sink event '" + name + "', ignoring!", getLocationLogPrefix()); } + @Override + public void markupLineBreak(int indentLevel) { + // not allowed in all contexts + if (elementContextStack.element().isAllowsMarkupLinebreaks()) { + super.markupLineBreak(indentLevel); + } + } + protected void writeUnescaped(String text) { StringBuilder buffer = bufferStack.peek(); if (buffer != null) { @@ -1280,6 +1308,11 @@ public class MarkdownSink extends Xhtml5BaseSink implements MarkdownMarkup { } } + @Override + protected void write(String text) { + writeUnescaped(text); + } + @Override public void flush() { writer.flush(); diff --git a/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java 
b/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java index b3cefb07..817855e0 100644 --- a/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java +++ b/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java @@ -165,7 +165,7 @@ class MarkdownParserTest extends AbstractParserTest { "paragraph", "text", "paragraph_", - "text", + "markupLineBreak", "verbatim", "inline", "text", @@ -195,7 +195,7 @@ class MarkdownParserTest extends AbstractParserTest { "paragraph", "text", "paragraph_", - "text", + "markupLineBreak", "verbatim", "inline", "text", @@ -325,14 +325,14 @@ class MarkdownParserTest extends AbstractParserTest { "head_", "body", "list", - "text", + "markupLineBreak", "listItem", "text", "listItem_", "listItem", "text", "listItem_", - "text", + "markupLineBreak", "list_", "body_"); @@ -355,14 +355,14 @@ class MarkdownParserTest extends AbstractParserTest { "head_", "body", "numberedList", - "text", + "markupLineBreak", "numberedListItem", "text", "numberedListItem_", "numberedListItem", "text", "numberedListItem_", - "text", + "markupLineBreak", "numberedList_", "body_"); @@ -601,7 +601,7 @@ class MarkdownParserTest extends AbstractParserTest { "head_", "body", "division", - "text", + "markupLineBreak", "paragraph", "inline", "text", @@ -612,9 +612,9 @@ class MarkdownParserTest extends AbstractParserTest { "inline_", "text", "paragraph_", - "text", + "markupLineBreak", "division_", - "text", + "markupLineBreak", "horizontalRule", "section1", "sectionTitle1", @@ -623,27 +623,27 @@ class MarkdownParserTest extends AbstractParserTest { "paragraph", "text", "paragraph_", - "text", + "markupLineBreak", "table", "tableRows", - "text", + "markupLineBreak", "unknown", // tbody start "tableRow", "tableHeaderCell", "text", "tableHeaderCell_", "tableRow_", - "text", + "markupLineBreak", "tableRow", 
"tableCell", "text", "tableCell_", "tableRow_", - "text", + "markupLineBreak", "unknown", // tbody end "tableRows_", "table_", - "text", + "markupLineBreak", "section1_", "body_"); diff --git a/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownSinkTest.java b/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownSinkTest.java index 95c06f95..f5f06da1 100644 --- a/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownSinkTest.java +++ b/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownSinkTest.java @@ -26,6 +26,7 @@ import java.io.StringReader; import java.io.StringWriter; import java.io.Writer; +import org.apache.maven.doxia.module.xhtml5.Xhtml5Parser; import org.apache.maven.doxia.parser.ParseException; import org.apache.maven.doxia.parser.Parser; import org.apache.maven.doxia.sink.Sink; @@ -46,6 +47,10 @@ class MarkdownSinkTest extends AbstractSinkTest { @Inject protected MarkdownParser parser; + @Inject + protected Xhtml5Parser htmlParser; + + protected String outputExtension() { return "md"; } @@ -157,7 +162,7 @@ class MarkdownSinkTest extends AbstractSinkTest { + MarkdownMarkup.TABLE_CELL_SEPARATOR_MARKUP + "---" + MarkdownMarkup.TABLE_CELL_SEPARATOR_MARKUP + EOL + MarkdownMarkup.TABLE_ROW_PREFIX + cell + MarkdownMarkup.TABLE_CELL_SEPARATOR_MARKUP + cell + MarkdownMarkup.TABLE_CELL_SEPARATOR_MARKUP - + cell + MarkdownMarkup.TABLE_CELL_SEPARATOR_MARKUP + EOL; + + cell + MarkdownMarkup.TABLE_CELL_SEPARATOR_MARKUP + EOL + EOL; } @Override @@ -187,6 +192,7 @@ class MarkdownSinkTest extends AbstractSinkTest { .append("2|") .append(EOL); } + expectedMarkup.append(EOL); return expectedMarkup.toString(); } @@ -361,7 +367,11 @@ class MarkdownSinkTest extends AbstractSinkTest { } private void parseFile(Parser parser, String file, Sink sink) throws ParseException, IOException { - try (Reader 
reader = getTestReader(file)) { + parseFile(parser, file, outputExtension(), sink); + } + + private void parseFile(Parser parser, String file, String extension, Sink sink) throws ParseException, IOException { + try (Reader reader = getTestReader(file, extension)) { parser.parse(reader, sink); } } @@ -395,7 +405,7 @@ class MarkdownSinkTest extends AbstractSinkTest { sink.close(); String expected = - "| | |" + EOL + "|---|---|" + EOL + "|[link](target)|paragraph text with \\|**bold**|" + EOL; + "| | |" + EOL + "|---|---|" + EOL + "|[link](target)|paragraph text with \\|**bold**|" + EOL + EOL; assertEquals(expected, getSinkContent(), "Wrong link or paragraph markup in table cell"); } @@ -595,4 +605,35 @@ class MarkdownSinkTest extends AbstractSinkTest { + EOL; assertEquals(expected, getSinkContent()); } + + @Test + void listItemsContainingInsignificantWhitespace() { + try (Sink sink = getSink()) { + sink.list(); + sink.listItem(); + sink.markupLineBreak(4); + sink.text("item 1"); + sink.listItem_(); + sink.listItem(); + sink.markupLineBreak(4); + sink.text("item 2"); + sink.listItem_(); + sink.list_(); + } + String expected = "- item 1" + EOL + + "- item 2" + EOL; + assertEquals(expected, getSinkContent()); + } + + @Test + void tableWithInsignificantNewLines() throws ParseException, IOException { + parseFile(htmlParser, "table", "html", getSink()); + String expected = "|Format<br />Newline|Short description|Doxia Module|" + EOL + + "|---|---|---|" + EOL + + "|[iText](../modules/index.html#iText)|iText PDF Library|[`doxia-module-itext`](../doxia/doxia-modules/doxia-module-itext/)|" + EOL + + "|[FO](../modules/index.html#FO)<sup>\\*</sup>|XSL formatting objects \\(XSL-FO\\)|[`doxia-module-fo`](../doxia/doxia-modules/doxia-module-fo/)|" + EOL + + "|[LaTeX](../modules/index.html#LaTeX)|LaTeX typesetting system|[`doxia-module-latex`](../doxia/doxia-modules/doxia-module-latex/)|" + EOL + + "|[RTF](../modules/index.html#RTF)|Microsoft Rich Text 
Format|[`doxia-module-rtf`](../doxia/doxia-modules/doxia-module-rtf/)|" + EOL + EOL; + assertEquals(expected, getSinkContent()); + } } diff --git a/doxia-modules/doxia-module-markdown/src/test/resources/table.html b/doxia-modules/doxia-module-markdown/src/test/resources/table.html new file mode 100644 index 00000000..d26039df --- /dev/null +++ b/doxia-modules/doxia-module-markdown/src/test/resources/table.html @@ -0,0 +1,31 @@ +<table border="0"> + <tr> + <th>Format<br/>Newline</th> + <th>Short description</th> + <th>Doxia Module</th> + </tr> + + <tr> + <td><a href="../modules/index.html#iText">iText</a></td> + <td>iText PDF Library</td> + <td><a href="../doxia/doxia-modules/doxia-module-itext/"><code>doxia-module-itext</code></a></td> + </tr> + + <tr> + <td><a href="../modules/index.html#FO">FO</a><sup>*</sup></td> + <td>XSL formatting objects (XSL-FO)</td> + <td><a href="../doxia/doxia-modules/doxia-module-fo/"><code>doxia-module-fo</code></a></td> + </tr> + + <tr> + <td><a href="../modules/index.html#LaTeX">LaTeX</a></td> + <td>LaTeX typesetting system</td> + <td><a href="../doxia/doxia-modules/doxia-module-latex/"><code>doxia-module-latex</code></a></td> + </tr> + + <tr> + <td><a href="../modules/index.html#RTF">RTF</a></td> + <td>Microsoft Rich Text Format</td> + <td><a href="../doxia/doxia-modules/doxia-module-rtf/"><code>doxia-module-rtf</code></a></td> + </tr> + </table> diff --git a/doxia-modules/doxia-module-xdoc/src/main/java/org/apache/maven/doxia/module/xdoc/XdocParser.java b/doxia-modules/doxia-module-xdoc/src/main/java/org/apache/maven/doxia/module/xdoc/XdocParser.java index 5c646dca..677e2060 100644 --- a/doxia-modules/doxia-module-xdoc/src/main/java/org/apache/maven/doxia/module/xdoc/XdocParser.java +++ b/doxia-modules/doxia-module-xdoc/src/main/java/org/apache/maven/doxia/module/xdoc/XdocParser.java @@ -109,7 +109,7 @@ public class XdocParser extends Xhtml1BaseParser implements XdocMarkup { protected void handleStartTag(XmlPullParser parser, Sink 
sink) throws XmlPullParserException, MacroExecutionException { isEmptyElement = parser.isEmptyElementTag(); - + isBeginningOfLineInsideBlock = true; SinkEventAttributeSet attribs = getAttributesFromParser(parser); if (parser.getName().equals(DOCUMENT_TAG.toString())) { @@ -145,7 +145,6 @@ public class XdocParser extends Xhtml1BaseParser implements XdocMarkup { sink.head_(); this.inHead = false; } - sink.body(attribs); } else if (parser.getName().equals(SECTION_TAG.toString())) { handleSectionStart(Sink.SECTION_LEVEL_1, sink, attribs, parser); @@ -191,6 +190,7 @@ public class XdocParser extends Xhtml1BaseParser implements XdocMarkup { protected void handleEndTag(XmlPullParser parser, Sink sink) throws XmlPullParserException, MacroExecutionException { + isBeginningOfLineInsideBlock = true; if (parser.getName().equals(DOCUMENT_TAG.toString())) { // Do nothing return; diff --git a/doxia-modules/doxia-module-xdoc/src/test/java/org/apache/maven/doxia/module/xdoc/XdocParserTest.java b/doxia-modules/doxia-module-xdoc/src/test/java/org/apache/maven/doxia/module/xdoc/XdocParserTest.java index dcc9c202..f37790f0 100644 --- a/doxia-modules/doxia-module-xdoc/src/test/java/org/apache/maven/doxia/module/xdoc/XdocParserTest.java +++ b/doxia-modules/doxia-module-xdoc/src/test/java/org/apache/maven/doxia/module/xdoc/XdocParserTest.java @@ -408,6 +408,7 @@ class XdocParserTest extends AbstractParserTest { SinkEventElement styleElm = it.next(); assertEquals("unknown", styleElm.getName()); assertEquals("style", styleElm.getArgs()[0]); + assertEquals("markupLineBreak", it.next().getName()); SinkEventElement cdataElm = it.next(); assertEquals("unknown", cdataElm.getName()); assertEquals("CDATA", cdataElm.getArgs()[0]); @@ -480,4 +481,27 @@ class XdocParserTest extends AbstractParserTest { assertSinkEquals(it.next(), "text", "test", null); assertSinkEquals(it, "inline_"); } + + @Test + void indentedTags() throws Exception { + final String text = "<section name=\"test\">\n" + " 
<p>test</p>\n" + "</section>"; + + SinkEventTestingSink sink = new SinkEventTestingSink(); + + parser.setValidate(false); + parser.parse(text, sink); + Iterator<SinkEventElement> it = sink.getEventList().iterator(); + assertSinkEquals( + it, + "section1", + "sectionTitle1", + "text", + "sectionTitle1_", + "markupLineBreak", + "paragraph", + "text", + "paragraph_", + "markupLineBreak", + "section1_"); + } } diff --git a/doxia-sink-api/src/main/java/org/apache/maven/doxia/sink/Sink.java b/doxia-sink-api/src/main/java/org/apache/maven/doxia/sink/Sink.java index 61a199cd..972d96fe 100644 --- a/doxia-sink-api/src/main/java/org/apache/maven/doxia/sink/Sink.java +++ b/doxia-sink-api/src/main/java/org/apache/maven/doxia/sink/Sink.java @@ -1763,15 +1763,15 @@ public interface Sink extends AutoCloseable { void comment(String comment); /** - * Add a comment. The default implementation will just call {@link #comment(String)}. + * Add a single line break with the specified indentation level. The default implementation does nothing. + * This is different from emitting a line break with {@link #lineBreak(SinkEventAttributes)} or {@link #text(String, SinkEventAttributes)} as those line breaks are part of the content (i.e. affect rendering) + * while this line break is purely for pretty-printing the Sink's output and should not affect the rendering of the content. + * This is useful for Sinks that emit text-based markup languages (e.g. HTML, XML, etc.) to produce more human-readable output. * - * @param comment The comment to write. - * @param endsWithLineBreak If {@code true} comment ends with a line break, i.e. nothing else should follow on the same line + * @param indentLevel the indentation level, where 0 means no indentation, 1 means one level of indentation, etc. The sink can decide how many spaces/tabs to use for each level of indentation. 
* @since 2.1.0 */ - default void comment(String comment, boolean endsWithLineBreak) { - comment(comment); - } + default void markupLineBreak(int indentLevel) {} /** * Add an unknown event. This may be used by parsers to notify a general Sink about diff --git a/pom.xml b/pom.xml index f49fc441..ddc17311 100644 --- a/pom.xml +++ b/pom.xml @@ -216,6 +216,7 @@ under the License. <exclude>src/test/resources/**/*.apt</exclude> <exclude>src/test/resources/**/*.apt.vm</exclude> <exclude>src/test/resources/**/*.md</exclude> + <exclude>src/test/resources/**/*.html</exclude> <exclude>src/it/**/site/**/*.md</exclude> <exclude>src/it/**/site/**/*.markdown</exclude> </excludes>
