Tim Starling has submitted this change and it was merged.

Change subject: Eliminate edge cases from p-wrapping and block tag interaction
......................................................................


Eliminate edge cases from p-wrapping and block tag interaction

* Move this closer to an "ideal" DOM-based p-wrapping algorithm
  (in SAX land, however). Output on some of these snippets matches
  Parsoid's and PHP-Parser+Tidy's in rendering (based on limited
  testing of different snippets in a browser).

* Eliminates the somewhat surprising edge cases found in T150317.

* Updated one test and added three new tests.

* Added a flush() call to the output stream since I was seeing some
  oddities based on presence/absence of debug output calls.

Bug: T150317
Change-Id: I94cd5b98812403b1758a231ca1ed56d3887555c8
---
M src/main/java/org/wikimedia/html5depurate/CompatibilitySerializer.java
M src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java
M src/test/java/org/wikimedia/html5depurate/DepuratorTest.java
3 files changed, 143 insertions(+), 27 deletions(-)

Approvals:
  Tim Starling: Verified; Looks good to me, approved



diff --git 
a/src/main/java/org/wikimedia/html5depurate/CompatibilitySerializer.java 
b/src/main/java/org/wikimedia/html5depurate/CompatibilitySerializer.java
index a02a1b9..2547094 100644
--- a/src/main/java/org/wikimedia/html5depurate/CompatibilitySerializer.java
+++ b/src/main/java/org/wikimedia/html5depurate/CompatibilitySerializer.java
@@ -24,29 +24,36 @@
                public String localName;
                public String qName;
                public Attributes attrs;
-               OutputStream stream;
+               OutputStream savedStream;
                public boolean needsPWrapping;
                public boolean isPWrapper;
                public boolean blank;
+               public boolean hasText;
+               public boolean split;
+               public int blockNestingLevel;
                public boolean isDisabledPWrapper;
 
                public StackEntry(String uri_, String localName_, String qName_,
-                               Attributes attrs_, OutputStream stream_) {
+                               Attributes attrs_, OutputStream savedStream_) {
                        uri = uri_;
                        localName = localName_;
                        qName = qName_;
                        attrs = attrs_;
-                       stream = stream_;
+                       savedStream = savedStream_;
                        needsPWrapping = "body".equals(localName_)
                                || "blockquote".equals(localName_);
                        blank = true;
+                       hasText = false;
                        isPWrapper = "mw:p-wrap".equals(localName_);
+                       blockNestingLevel = 0;
+                       isDisabledPWrapper = false;
+                       split = false;
                }
        }
 
        protected Stack<StackEntry> m_stack;
        protected DepurateSerializer m_serializer;
-       protected StackEntry m_currentPWrapper;
+       protected Stack<StackEntry> m_pStack;
 
        // Warning: this list must be in alphabetical order
        protected static final String[] ONLY_INLINE_ELEMENTS = {"a", "abbr", 
"acronym",
@@ -61,12 +68,13 @@
 
        public CompatibilitySerializer(OutputStream out) {
                m_stack = new Stack<StackEntry>();
+               m_pStack = new Stack<StackEntry>();
                m_serializer = new DepurateSerializer(out);
        }
 
-       private StackEntry peek() throws SAXException {
+       private StackEntry peek(Stack<StackEntry> stack) throws SAXException {
                try {
-                       return m_stack.peek();
+                       return stack.peek();
                } catch (EmptyStackException e) {
                        return null;
                }
@@ -80,12 +88,12 @@
                try {
                        StackEntry entry = m_stack.pop();
                        if (entry.isPWrapper) {
-                               m_currentPWrapper = null;
+                               m_pStack.pop();
                        }
-                       ByteArrayOutputStream oldStream =
+                       ByteArrayOutputStream entryStream =
                                
(ByteArrayOutputStream)m_serializer.getOutputStream();
-                       m_serializer.setOutputStream(entry.stream);
-                       return oldStream;
+                       m_serializer.setOutputStream(entry.savedStream);
+                       return entryStream;
                } catch (EmptyStackException e) {
                        throw new SAXException(e);
                }
@@ -110,7 +118,7 @@
         */
        private StackEntry pushPWrapper() throws SAXException {
                StackEntry entry = push("", "mw:p-wrap", "mw:p-wrap", new 
AttributesImpl());
-               m_currentPWrapper = entry;
+               m_pStack.push(entry);
                return entry;
        }
 
@@ -127,16 +135,20 @@
 
        public void characters(char[] chars, int start, int length)
                        throws SAXException {
-               StackEntry entry = peek();
+               StackEntry entry = peek(m_stack);
                if (entry != null) {
                        if (entry.needsPWrapping) {
                                entry = pushPWrapper();
                        }
-                       if (entry.blank) {
+                       if (entry.blank || !entry.hasText) {
                                for (int i = start; i < start + length; i++) {
                                        char c = chars[i];
                                        if (!(c == 9 || c == 10 || c == 12 || c 
== 13 || c == 32)) {
                                                entry.blank = false;
+                                               entry.hasText = true;
+                                               if (peek(m_pStack) != null) {
+                                                       peek(m_pStack).blank = 
false;
+                                               }
                                                break;
                                        }
                                }
@@ -145,20 +157,106 @@
                m_serializer.characters(chars, start, length);
        }
 
+       private void splitTagStack(boolean haveContent) throws SAXException {
+               StackEntry currentPWrapper = peek(m_pStack);
+               ByteArrayOutputStream seContent;
+               int n = m_stack.size();
+               int i = n - 1;
+               StackEntry se = m_stack.get(i);
+               while (se != currentPWrapper) {
+                       seContent = 
(ByteArrayOutputStream)m_serializer.getOutputStream();
+                       m_serializer.setOutputStream(se.savedStream);
+
+                       if (se.hasText) {
+                               haveContent = true;
+                       }
+
+                       // Emit content accumulated so far
+                       if (haveContent) {
+                               m_serializer.startElement(se.uri, se.localName, 
se.qName, se.attrs);
+                               m_serializer.writeStream(seContent);
+                               m_serializer.endElement(se.uri, se.localName, 
se.qName);
+
+                               // All text has been output at this point
+                               // Reset the element and record that it has 
been split
+                               se.split = true;
+                               se.blank = true;
+                               se.hasText = false;
+                               se.savedStream = new ByteArrayOutputStream();
+                       }
+
+                       i--;
+                       se = m_stack.get(i);
+               }
+
+               // Dump <p>.. contents ..</p>
+               // Note se == currentPWrapper
+               if (haveContent || se.hasText) {
+                       seContent = 
(ByteArrayOutputStream)m_serializer.getOutputStream();
+                       m_serializer.setOutputStream(se.savedStream);
+
+                       // Emit content accumulated so far
+                       writePWrapper(se, seContent);
+
+                       // All text has been output at this point
+                       se.blank = true;
+               }
+
+               // New stream going forward
+               m_serializer.setOutputStream(new ByteArrayOutputStream());
+       }
+
        private boolean isOnlyInline(String localName) {
                return Arrays.binarySearch(ONLY_INLINE_ELEMENTS, localName) > 
-1;
        }
 
+       private void enterBlock(String tagName) throws SAXException {
+               // Whenever we enter a new block wrapper that is
+               // embedded within a p-wrapper,
+               //
+               // 1. Disable p-wrapping.
+               // 2. Split the tag stack and emit accumulated output
+               //    with a p-wrapper.
+
+               StackEntry currentPWrapper = peek(m_pStack);
+
+               if (currentPWrapper.blockNestingLevel == 0) {
+                       splitTagStack(false);
+               }
+
+               currentPWrapper.blockNestingLevel++;
+               currentPWrapper.isDisabledPWrapper = true;
+       }
+
+       private void leaveBlock(String tagName) throws SAXException {
+               // Whenever we leave the outermost block wrapper that is
+               // embedded within a p-wrapper,
+               //
+               // 1. Re-enable p-wrapping.
+               // 2. Split the tag stack and emit accumulated output
+               //    without a p-wrapper.
+
+               StackEntry currentPWrapper = peek(m_pStack);
+               currentPWrapper.blockNestingLevel--;
+
+               if (currentPWrapper.blockNestingLevel == 0) {
+                       splitTagStack(true);
+               }
+
+               currentPWrapper.isDisabledPWrapper = false;
+       }
+
        public void startElement(String uri, String localName, String qName,
                        Attributes atts) throws SAXException {
-               StackEntry oldEntry = peek();
+
+               StackEntry oldEntry = peek(m_stack);
                if (oldEntry != null) {
                        if (oldEntry.isPWrapper) {
                                if (!isOnlyInline(localName)) {
                                        // This is non-inline so close the 
p-wrapper
                                        ByteArrayOutputStream contents = 
popAndGetContents();
                                        writePWrapper(oldEntry, contents);
-                                       oldEntry = peek();
+                                       oldEntry = peek(m_stack);
                                } else {
                                        // We're putting an element inside the 
p-wrapper, so it is non-blank now
                                        oldEntry.blank = false;
@@ -168,10 +266,11 @@
                        }
                }
 
-               // Disable any ancestor p-wrapper if this is a non-inline 
element
+               // Track block nesting level
                boolean onlyInline = isOnlyInline(localName);
-               if (m_currentPWrapper != null && !onlyInline) {
-                       m_currentPWrapper.isDisabledPWrapper = true;
+               StackEntry currentPWrapper = peek(m_pStack);
+               if (currentPWrapper != null && !onlyInline) {
+                       enterBlock(localName);
                }
 
                if (oldEntry != null && oldEntry.needsPWrapping && onlyInline) {
@@ -184,14 +283,14 @@
 
        public void endElement(String uri, String localName, String qName)
                        throws SAXException {
-               StackEntry entry = peek();
+               StackEntry entry = peek(m_stack);
                ByteArrayOutputStream contents = popAndGetContents();
 
                if (entry.isPWrapper) {
                        // Since we made this p-wrapper, the caller really 
wants to end the parent element.
                        // So first we need to close the p-wrapper
                        writePWrapper(entry, contents);
-                       entry = peek();
+                       entry = peek(m_stack);
                        contents = popAndGetContents();
                }
 
@@ -204,9 +303,19 @@
                                entry.attrs = newAttrs;
                        }
                }
-               m_serializer.startElement(entry.uri, entry.localName, 
entry.qName, entry.attrs);
-               m_serializer.writeStream(contents);
-               m_serializer.endElement(uri, localName, qName);
+
+               if (!entry.split || !entry.blank) {
+                       m_serializer.startElement(entry.uri, entry.localName, 
entry.qName, entry.attrs);
+                       m_serializer.writeStream(contents);
+                       m_serializer.endElement(uri, localName, qName);
+               }
+
+               // Track block nesting level
+               boolean onlyInline = isOnlyInline(localName);
+               StackEntry currentPWrapper = peek(m_pStack);
+               if (currentPWrapper != null && !onlyInline) {
+                       leaveBlock(localName);
+               }
        }
 
        public void startDocument() throws SAXException {
diff --git a/src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java 
b/src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java
index 43a86da..32948af 100644
--- a/src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java
+++ b/src/main/java/org/wikimedia/html5depurate/DepurateSerializer.java
@@ -97,6 +97,7 @@
        public void write(String s) throws SAXException {
                try {
                        writer.write(s);
+                       writer.flush();
                } catch (IOException e) {
                        throw new SAXException(e);
                }
diff --git a/src/test/java/org/wikimedia/html5depurate/DepuratorTest.java 
b/src/test/java/org/wikimedia/html5depurate/DepuratorTest.java
index 3a3defc..5b44fa3 100644
--- a/src/test/java/org/wikimedia/html5depurate/DepuratorTest.java
+++ b/src/test/java/org/wikimedia/html5depurate/DepuratorTest.java
@@ -73,11 +73,17 @@
                        // 22. p-wrapping is enabled in a blockquote in an 
inline element
                        {COMPAT,  "<small><blockquote>x</blockquote></small>",
                                
"<small><blockquote><p>x</p></blockquote></small>"},
-                       // 23. A block element inside a detected paragraph 
disarms
-                       // p-wrapping for the remainder of the detected 
paragraph, but it
-                       // is re-armed by a subsequent top-level block element.
+                       // 23. All bare text should be p-wrapped even when 
surrounded by block tags
                        {COMPAT, 
"<small><blockquote>x</blockquote></small>y<div></div>z",
-                               
"<small><blockquote><p>x</p></blockquote></small>y<div></div><p>z</p>"},
+                               
"<small><blockquote><p>x</p></blockquote></small><p>y</p><div></div><p>z</p>"},
+                       // 24a, 24b, 24c. If necessary, the tag stack should be 
split to ensure
+                       //                that all bare text is p-wrapped 
correctly.
+                       {COMPAT, "<small>x<div>y</div>z</small>",
+                               
"<p><small>x</small></p><small><div>y</div></small><p><small>z</small></p>"},
+                       {COMPAT, "<small><div>y</div>z</small>",
+                               
"<small><div>y</div></small><p><small>z</small></p>"},
+                       {COMPAT, "<small>x<div>y</div></small>",
+                               
"<p><small>x</small></p><small><div>y</div></small>"},
                });
        }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/321007
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I94cd5b98812403b1758a231ca1ed56d3887555c8
Gerrit-PatchSet: 6
Gerrit-Project: mediawiki/services/html5depurate
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org>
Gerrit-Reviewer: C. Scott Ananian <canan...@wikimedia.org>
Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org>
Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to