Tim Starling has submitted this change and it was merged. Change subject: Eliminate edge cases from p-wrapping and block tag interaction ......................................................................
Eliminate edge cases from p-wrapping and block tag interaction * Move this closer to an "ideal" DOM-based p-wrapping algorithm (in SAX land, however). Output on some of these snippets matches Parsoid's and PHP-Parser+Tidy's in rendering (based on limited testing of different snippets in a browser). * Eliminates the somewhat surprising edge cases found in T150317. * Updated one test and added three new tests. * Added a flush() call to the output stream since I was seeing some oddities based on presence/absence of debug output calls. Bug: T150317 Change-Id: I94cd5b98812403b1758a231ca1ed56d3887555c8 --- M src/main/java/org/wikimedia/html5depurate/CompatibilitySerializer.java M src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java M src/test/java/org/wikimedia/html5depurate/DepuratorTest.java 3 files changed, 143 insertions(+), 27 deletions(-) Approvals: Tim Starling: Verified; Looks good to me, approved diff --git a/src/main/java/org/wikimedia/html5depurate/CompatibilitySerializer.java b/src/main/java/org/wikimedia/html5depurate/CompatibilitySerializer.java index a02a1b9..2547094 100644 --- a/src/main/java/org/wikimedia/html5depurate/CompatibilitySerializer.java +++ b/src/main/java/org/wikimedia/html5depurate/CompatibilitySerializer.java @@ -24,29 +24,36 @@ public String localName; public String qName; public Attributes attrs; - OutputStream stream; + OutputStream savedStream; public boolean needsPWrapping; public boolean isPWrapper; public boolean blank; + public boolean hasText; + public boolean split; + public int blockNestingLevel; public boolean isDisabledPWrapper; public StackEntry(String uri_, String localName_, String qName_, - Attributes attrs_, OutputStream stream_) { + Attributes attrs_, OutputStream savedStream_) { uri = uri_; localName = localName_; qName = qName_; attrs = attrs_; - stream = stream_; + savedStream = savedStream_; needsPWrapping = "body".equals(localName_) || "blockquote".equals(localName_); blank = true; + hasText = false; isPWrapper = "mw:p-wrap".equals(localName_); + blockNestingLevel = 0; + isDisabledPWrapper = false; + split = false; } } protected Stack<StackEntry> m_stack; protected DepurateSerializer m_serializer; - protected StackEntry m_currentPWrapper; + protected Stack<StackEntry> m_pStack; // Warning: this list must be in alphabetical order protected static final String[] ONLY_INLINE_ELEMENTS = {"a", "abbr", "acronym", @@ -61,12 +68,13 @@ public CompatibilitySerializer(OutputStream out) { m_stack = new Stack<StackEntry>(); + m_pStack = new Stack<StackEntry>(); m_serializer = new DepurateSerializer(out); } - private StackEntry peek() throws SAXException { + private StackEntry peek(Stack<StackEntry> stack) throws SAXException { try { - return m_stack.peek(); + return stack.peek(); } catch (EmptyStackException e) { return null; } @@ -80,12 +88,12 @@ try { StackEntry entry = m_stack.pop(); if (entry.isPWrapper) { - m_currentPWrapper = null; + m_pStack.pop(); } - ByteArrayOutputStream oldStream = + ByteArrayOutputStream entryStream = (ByteArrayOutputStream)m_serializer.getOutputStream(); - m_serializer.setOutputStream(entry.stream); - return oldStream; + m_serializer.setOutputStream(entry.savedStream); + return entryStream; } catch (EmptyStackException e) { throw new SAXException(e); } @@ -110,7 +118,7 @@ */ private StackEntry pushPWrapper() throws SAXException { StackEntry entry = push("", "mw:p-wrap", "mw:p-wrap", new AttributesImpl()); - m_currentPWrapper = entry; + m_pStack.push(entry); return entry; } @@ -127,16 +135,20 @@ public void characters(char[] chars, int start, int length) throws SAXException { - StackEntry entry = peek(); + StackEntry entry = peek(m_stack); if (entry != null) { if (entry.needsPWrapping) { entry = pushPWrapper(); } - if (entry.blank) { + if (entry.blank || !entry.hasText) { for (int i = start; i < start + length; i++) { char c = chars[i]; if (!(c == 9 || c == 10 || c == 12 || c == 13 || c == 32)) { entry.blank = false; + entry.hasText = true; + if (peek(m_pStack) != null) { + peek(m_pStack).blank = false; + } break; } } @@ -145,20 +157,106 @@ m_serializer.characters(chars, start, length); } + private void splitTagStack(boolean haveContent) throws SAXException { + StackEntry currentPWrapper = peek(m_pStack); + ByteArrayOutputStream seContent; + int n = m_stack.size(); + int i = n - 1; + StackEntry se = m_stack.get(i); + while (se != currentPWrapper) { + seContent = (ByteArrayOutputStream)m_serializer.getOutputStream(); + m_serializer.setOutputStream(se.savedStream); + + if (se.hasText) { + haveContent = true; + } + + // Emit content accumulated so far + if (haveContent) { + m_serializer.startElement(se.uri, se.localName, se.qName, se.attrs); + m_serializer.writeStream(seContent); + m_serializer.endElement(se.uri, se.localName, se.qName); + + // All text has been output at this point + // Reset the element and record that it has been split + se.split = true; + se.blank = true; + se.hasText = false; + se.savedStream = new ByteArrayOutputStream(); + } + + i--; + se = m_stack.get(i); + } + + // Dump <p>.. contents ..</p> + // Note se == currentPWrapper + if (haveContent || se.hasText) { + seContent = (ByteArrayOutputStream)m_serializer.getOutputStream(); + m_serializer.setOutputStream(se.savedStream); + + // Emit content accumulated so far + writePWrapper(se, seContent); + + // All text has been output at this point + se.blank = true; + } + + // New stream going forward + m_serializer.setOutputStream(new ByteArrayOutputStream()); + } + private boolean isOnlyInline(String localName) { return Arrays.binarySearch(ONLY_INLINE_ELEMENTS, localName) > -1; } + private void enterBlock(String tagName) throws SAXException { + // Whenever we enter a new block wrapper that is + // embedded within a p-wrapper, + // + // 1. Disable p-wrapping. + // 2. Split the tag stack and emit accumulated output + // with a p-wrapper. + + StackEntry currentPWrapper = peek(m_pStack); + + if (currentPWrapper.blockNestingLevel == 0) { + splitTagStack(false); + } + + currentPWrapper.blockNestingLevel++; + currentPWrapper.isDisabledPWrapper = true; + } + + private void leaveBlock(String tagName) throws SAXException { + // Whenever we leave the outermost block wrapper that is + // embedded within a p-wrapper, + // + // 1. Re-enable p-wrapping. + // 2. Split the tag stack and emit accumulated output + // without a p-wrapper. + + StackEntry currentPWrapper = peek(m_pStack); + currentPWrapper.blockNestingLevel--; + + if (currentPWrapper.blockNestingLevel == 0) { + splitTagStack(true); + } + + currentPWrapper.isDisabledPWrapper = false; + } + public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { - StackEntry oldEntry = peek(); + + StackEntry oldEntry = peek(m_stack); if (oldEntry != null) { if (oldEntry.isPWrapper) { if (!isOnlyInline(localName)) { // This is non-inline so close the p-wrapper ByteArrayOutputStream contents = popAndGetContents(); writePWrapper(oldEntry, contents); - oldEntry = peek(); + oldEntry = peek(m_stack); } else { // We're putting an element inside the p-wrapper, so it is non-blank now oldEntry.blank = false; @@ -168,10 +266,11 @@ } } - // Disable any ancestor p-wrapper if this is a non-inline element + // Track block nesting level boolean onlyInline = isOnlyInline(localName); - if (m_currentPWrapper != null && !onlyInline) { - m_currentPWrapper.isDisabledPWrapper = true; + StackEntry currentPWrapper = peek(m_pStack); + if (currentPWrapper != null && !onlyInline) { + enterBlock(localName); } if (oldEntry != null && oldEntry.needsPWrapping && onlyInline) { @@ -184,14 +283,14 @@ public void endElement(String uri, String localName, String qName) throws SAXException { - StackEntry entry = peek(); + StackEntry entry = peek(m_stack); ByteArrayOutputStream contents = popAndGetContents(); if (entry.isPWrapper) { // Since we made this p-wrapper, the caller really wants to end the parent element. // So first we need to close the p-wrapper writePWrapper(entry, contents); - entry = peek(); + entry = peek(m_stack); contents = popAndGetContents(); } @@ -204,9 +303,19 @@ entry.attrs = newAttrs; } } - m_serializer.startElement(entry.uri, entry.localName, entry.qName, entry.attrs); - m_serializer.writeStream(contents); - m_serializer.endElement(uri, localName, qName); + + if (!entry.split || !entry.blank) { + m_serializer.startElement(entry.uri, entry.localName, entry.qName, entry.attrs); + m_serializer.writeStream(contents); + m_serializer.endElement(uri, localName, qName); + } + + // Track block nesting level + boolean onlyInline = isOnlyInline(localName); + StackEntry currentPWrapper = peek(m_pStack); + if (currentPWrapper != null && !onlyInline) { + leaveBlock(localName); + } } public void startDocument() throws SAXException { diff --git a/src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java b/src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java index 43a86da..32948af 100644 --- a/src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java +++ b/src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java @@ -97,6 +97,7 @@ public void write(String s) throws SAXException { try { writer.write(s); + writer.flush(); } catch (IOException e) { throw new SAXException(e); } diff --git a/src/test/java/org/wikimedia/html5depurate/DepuratorTest.java b/src/test/java/org/wikimedia/html5depurate/DepuratorTest.java index 3a3defc..5b44fa3 100644 --- a/src/test/java/org/wikimedia/html5depurate/DepuratorTest.java +++ b/src/test/java/org/wikimedia/html5depurate/DepuratorTest.java @@ -73,11 +73,17 @@ // 22. p-wrapping is enabled in a blockquote in an inline element {COMPAT, "<small><blockquote>x</blockquote></small>", "<small><blockquote><p>x</p></blockquote></small>"}, - // 23. A block element inside a detected paragraph disarms - // p-wrapping for the remainder of the detected paragraph, but it - // is re-armed by a subsequent top-level block element. + // 23. All bare text should be p-wrapped even when surrounded by block tags {COMPAT, "<small><blockquote>x</blockquote></small>y<div></div>z", - "<small><blockquote><p>x</p></blockquote></small>y<div></div><p>z</p>"}, + "<small><blockquote><p>x</p></blockquote></small><p>y</p><div></div><p>z</p>"}, + // 24a, 24b, 24c. If necessary, the tag stack should be split to ensure + // that all bare text is p-wrapped correctly. + {COMPAT, "<small>x<div>y</div>z</small>", + "<p><small>x</small></p><small><div>y</div></small><p><small>z</small></p>"}, + {COMPAT, "<small><div>y</div>z</small>", + "<small><div>y</div></small><p><small>z</small></p>"}, + {COMPAT, "<small>x<div>y</div></small>", + "<p><small>x</small></p><small><div>y</div></small>"}, }); } -- To view, visit https://gerrit.wikimedia.org/r/321007 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I94cd5b98812403b1758a231ca1ed56d3887555c8 Gerrit-PatchSet: 6 Gerrit-Project: mediawiki/services/html5depurate Gerrit-Branch: master Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: C. Scott Ananian <canan...@wikimedia.org> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits