[MediaWiki-commits] [Gerrit] Basic implementation of purely DOM-based separator handling - change (mediawiki...Parsoid)

Subramanya Sastry (Code Review) Thu, 21 Feb 2013 10:28:10 -0800

Subramanya Sastry has submitted this change and it was merged.

Change subject: Basic implementation of purely DOM-based separator handling
......................................................................



Basic implementation of purely DOM-based separator handling

This patch set implements a DOM-based separator handling pipeline, but does
not enable it by default yet.

Each element handler can now define a sepNls object like this:

sepNls: {
    before: function(node) { if(node).... (logic).. return {min: 1}; },
    after: id({min: 1, max: 2}),
    firstChild: id({min: 1}),
    lastChild: id({min: 1})
}

Each handler is passed the 'other' element node (the node on the opposite side
of the separator), which allows customizing the result for pairs of elements.

The code path can be exercised and seems to work ok, but is not yet used.
You can uncomment the this.testHandleSeparator call at line 2926 to get some
debug prints.

Also, fix JSHint errors in WikitextSerializer and define proper exports in
mediawiki.parser.defines.js in preparation for moving away from globals.

No change in parserTests, as the new separator code is not used yet.

Change-Id: Ie0aa7530918da9eb584f2039d49adceeb9f56f5f
---
M js/lib/mediawiki.WikitextSerializer.js
M js/lib/mediawiki.parser.defines.js
2 files changed, 325 insertions(+), 93 deletions(-)

Approvals:
  Subramanya Sastry: Verified; Looks good to me, approved



diff --git a/js/lib/mediawiki.WikitextSerializer.js 
b/js/lib/mediawiki.WikitextSerializer.js
index ff38205..564a635 100644
--- a/js/lib/mediawiki.WikitextSerializer.js
+++ b/js/lib/mediawiki.WikitextSerializer.js
@@ -28,9 +28,9 @@
 var PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
        wtConsts = require('./mediawiki.wikitext.constants.js'),
        WikitextConstants = wtConsts.WikitextConstants,
-       Node = wtConsts.Node,
        Util = require('./mediawiki.Util.js').Util,
        DU = require('./mediawiki.DOMUtils.js').DOMUtils,
+       pd = require('./mediawiki.parser.defines.js'),
        SanitizerConstants = 
require('./ext.core.Sanitizer.js').SanitizerConstants,
        tagWhiteListHash;
 
@@ -48,6 +48,15 @@
 
 function isHtmlBlockTag(name) {
        return name === 'body' || Util.isBlockTag(name);
+}
+
+function isTd(token) {
+       return token && token.constructor === pd.TagTk && token.name === 'td';
+}
+
+function isListItem(token) {
+       return token && token.constructor === pd.TagTk &&
+               ['li', 'dt', 'dd'].indexOf(token.name) !== -1;
 }
 
 var WikitextEscapeHandlers = function() { };
@@ -142,7 +151,7 @@
        // The code below will break if use async tokenization.
        p.processSync( prefixedText );
 
-       // If the token stream has a TagTk, SelfclosingTagTk, EndTagTk or 
CommentTk
+       // If the token stream has a pd.TagTk, pd.SelfclosingTagTk, pd.EndTagTk 
or pd.CommentTk
        // then this text needs escaping!
        var tagWhiteList = getTagWhiteList();
        var numEntities = 0;
@@ -155,7 +164,7 @@
                }
 
                var tc = t.constructor;
-               if (tc === SelfclosingTagTk) {
+               if (tc === pd.SelfclosingTagTk) {
                        // Ignore extlink tokens without valid urls
                        if (t.name === 'extlink' && 
!this.urlParser.tokenizeURL(t.getAttribute("href"))) {
                                continue;
@@ -171,7 +180,7 @@
                        }
                }
 
-               if (!linksOnly && tc === TagTk) {
+               if (!linksOnly && tc === pd.TagTk) {
                        // mw:Entity tokens
                        if (t.name === 'span' && t.getAttribute('typeof') === 
'mw:Entity') {
                                numEntities++;
@@ -181,7 +190,7 @@
                        return true;
                }
 
-               if (!linksOnly && tc === EndTagTk) {
+               if (!linksOnly && tc === pd.EndTagTk) {
                        // mw:Entity tokens
                        if (numEntities > 0 && t.name === 'span') {
                                numEntities--;
@@ -345,7 +354,7 @@
                if ( wtEscaper ) {
                        this.wteHandlerStack.pop();
                }
-               state.chunkCB = origChunkCB;
+               this.chunkCB = origChunkCB;
        },
 
        serializeDOMToString: function(node, wtEscaper) {
@@ -458,22 +467,13 @@
        return function(state, token) {
                var prevToken = state.prevToken;
                // Deal with empty headings. Ex: <h1></h1>
-               if (prevToken.constructor === TagTk && prevToken.name === 
token.name) {
+               if (prevToken.constructor === pd.TagTk && prevToken.name === 
token.name) {
                        return "<nowiki></nowiki>" + v;
                } else {
                        return v;
                }
        };
 };
-
-function isTd(token) {
-       return token && token.constructor === TagTk && token.name === 'td';
-}
-
-function isListItem(token) {
-       return token && token.constructor === TagTk &&
-               ['li', 'dt', 'dd'].indexOf(token.name) !== -1;
-}
 
 function isMultilineListItem(token) {
        return isListItem(token) && token.dataAttribs.stx !== 'row';
@@ -514,7 +514,7 @@
        // Template and template-arg markers are escaped unconditionally!
        // Conditional escaping requires matching brace pairs and knowledge
        // of whether we are in template arg context or not.
-       if (text.match(/{{{|{{|}}}|}}/)) {
+       if (text.match(/\{\{\{|\{\{|\}\}\}|\}\}/)) {
                // console.warn("---EWT:F3---");
                return escapedText(text);
        }
@@ -642,14 +642,14 @@
 WSP._listItemHandler = function ( handler, bullet, state, token ) {
 
        function isRepeatToken(state, token) {
-               return  state.prevToken.constructor === EndTagTk &&
+               return  state.prevToken.constructor === pd.EndTagTk &&
                                state.prevToken.name === token.name;
        }
 
        function isMultiLineDtDdPair(state, token) {
                return  token.name === 'dd' &&
                                token.dataAttribs.stx !== 'row' &&
-                               state.prevTagToken.constructor === EndTagTk &&
+                               state.prevTagToken.constructor === pd.EndTagTk 
&&
                                state.prevTagToken.name === 'dt';
        }
 
@@ -827,8 +827,8 @@
                close = '/';
        }
 
-       var sAttribs = WSP._serializeAttributes(state, token);
-       var tokenName = da.srcTagName || token.name;
+       var sAttribs = WSP._serializeAttributes(state, token),
+               tokenName = da.srcTagName || token.name;
        if (sAttribs.length > 0) {
                return '<' + tokenName + ' ' + sAttribs + close + '>';
        } else {
@@ -1208,7 +1208,7 @@
  *     the dom subtree).
  * ********************************************************************* */
 function id(v) {
-       return function( state ) {
+       return function() {
                return v;
        };
 }
@@ -1332,7 +1332,7 @@
                                // If the token has 'startTagSrc' set, it means 
that the tr was present
                                // in the source wikitext and we emit it -- if 
not, we ignore it.
                                var da = token.dataAttribs;
-                               if (state.prevToken.constructor === TagTk &&
+                               if (state.prevToken.constructor === pd.TagTk &&
                                        state.prevToken.name === 'tbody' &&
                                        !da.startTagSrc )
                                {
@@ -1385,8 +1385,8 @@
                        var prevToken = state.prevToken;
                        if (    token.attribs.length === 0 &&
                                        (       (state.listStack.length > 0 && 
isListItem(prevToken)) ||
-                                               (prevToken.constructor === 
TagTk && prevToken.name === 'td') ||
-                                               (state.ignorePTag && 
token.constructor === EndTagTk)))
+                                               (prevToken.constructor === 
pd.TagTk && prevToken.name === 'td') ||
+                                               (state.ignorePTag && 
token.constructor === pd.EndTagTk)))
                        {
                                state.ignorePTag = !state.ignorePTag;
                                return { start: { ignore: true }, end: { 
ignore: true } };
@@ -1399,7 +1399,7 @@
                        handle: function(state, token) {
                                var prevTag = state.prevTagToken;
                                if (state.env.page.src || (
-                                       prevTag && prevTag.constructor === 
TagTk && prevTag.name === 'body')) {
+                                       prevTag && prevTag.constructor === 
pd.TagTk && prevTag.name === 'body')) {
                                        this.emitsNL = false;
                                } else {
                                        this.emitsNL = true;
@@ -1410,8 +1410,8 @@
                end: {
                        handle: function(state, token) {
                                var prevTag = state.prevToken;
-                               if (prevTag && ((prevTag.constructor === TagTk 
&& prevTag.name === 'p') ||
-                                       (prevTag.constructor === EndTagTk 
&&prevTag.name === 'br'))) {
+                               if (prevTag && ((prevTag.constructor === 
pd.TagTk && prevTag.name === 'p') ||
+                                       (prevTag.constructor === pd.EndTagTk 
&&prevTag.name === 'br'))) {
                                        this.endsLine = true;
                                        this.emitsNL = false;
                                } else if (state.env.page.src) {
@@ -1422,6 +1422,16 @@
                                        this.emitsNL = true;
                                }
                                return '';
+                       }
+               },
+               sepNls: {
+                       before: function(otherNode) {
+                               return otherNode.nodeName === 'P' ?
+                                       {min: 2, max: 2} : {min: 1, max: 2};
+                       },
+                       after: function(otherNode) {
+                               return otherNode.nodeName === 'P' ?
+                                       {min: 2, max: 2} : {min: 1, max: 1};
                        }
                }
        },
@@ -1440,7 +1450,7 @@
                                };
 
                                var prevTagToken = state.prevTagToken;
-                               if (!state.env.page.src && prevTagToken && 
prevTagToken.constructor === EndTagTk &&
+                               if (!state.env.page.src && prevTagToken && 
prevTagToken.constructor === pd.EndTagTk &&
                                        prevTagToken.name === 'pre' && 
prevTagToken.dataAttribs.stx !== 'html')
                                {
                                        return '\n ';
@@ -1494,7 +1504,7 @@
                                                        return '';
                                                default:
                                                        this.solTransparent = 
false;
-                                                       return 
WSP._serializeHTMLTag( state, token );
+                                                       return 
WSP._serializeHTMLTag( state, token ) || '';
                                        }
                                } else if ( argDict.property ) {
                                        switchType = argDict.property.match( 
/^mw\:PageProp\/(.*)$/ );
@@ -1504,6 +1514,8 @@
                                                        switchType = 
token.dataAttribs.magicSrc;
                                                }
                                                return switchType;
+                                       } else {
+                                               return '';
                                        }
                                } else {
                                        return WSP._serializeHTMLTag( state, 
token );
@@ -1786,14 +1798,14 @@
 
        cb( WSP._serializeHTMLTag(
                                state,
-                               new TagTk(node.nodeName.toLowerCase(), 
attribKVs, node.data.parsoid)
+                               new pd.TagTk(node.nodeName.toLowerCase(), 
attribKVs, node.data.parsoid)
          ) );
        if (node.childNodes.length) {
                state.serializeChildren(node.childNodes, cb);
        }
        cb( WSP._serializeHTMLEndTag(
                                state,
-                               new EndTagTk(node.nodeName.toLowerCase(), 
attribKVs, node.data.parsoid)
+                               new pd.EndTagTk(node.nodeName.toLowerCase(), 
attribKVs, node.data.parsoid)
                                ) );
 };
 
@@ -1810,7 +1822,7 @@
                if (nodeTypeOf === "mw:TemplateSource") {
                        return {
                                handle: function () {
-                                       cb(dp.src)
+                                       cb(dp.src);
                                },
                                isTemplateSrc: true
                        };
@@ -1819,7 +1831,7 @@
                        // return src, and drop the generated content
                        return {
                                handle: function() {
-                                       cb(dp.src)
+                                       cb(dp.src);
                                }
                        };
                } else if (nodeTypeOf === "mw:Entity") {
@@ -1836,7 +1848,6 @@
                }
        }
        if (nodeName === 'span' && nodeTypeOf === 'mw:Image') {
-               console.log('img span');
                // Hack: forward this span to DOM-based link handler until the 
span
                // handler is fully DOM-based.
 
@@ -1878,7 +1889,7 @@
        if (token.isHTMLTag() ||
                        (
                         // Inherit stx: html for new elements from parent in 
some cases
-                               ( token.constructor === TagTk || 
token.constructor === EndTagTk ) &&
+                               ( token.constructor === pd.TagTk || 
token.constructor === pd.EndTagTk ) &&
                                // new element
                                Object.keys(token.dataAttribs).length === 0 &&
                                token.name !== 'meta' &&
@@ -1896,7 +1907,7 @@
        if ( ! handler ) {
                handler = this.defaultHTMLTagHandler;
        }
-       if ( token.constructor === TagTk || token.constructor === 
SelfclosingTagTk ) {
+       if ( token.constructor === pd.TagTk || token.constructor === 
pd.SelfclosingTagTk ) {
                // XXX: This looks like a slightly strange side-effect for
                // getTokenHandler
                state.wteHandlerStack.push(handler.wtEscapeHandler || null);
@@ -1938,8 +1949,8 @@
        var textHandler = state.textHandler;
 
        switch( token.constructor ) {
-               case TagTk:
-               case SelfclosingTagTk:
+               case pd.TagTk:
+               case pd.SelfclosingTagTk:
                        handler = WSP._getTokenHandler( state, token );
                        if ( ! handler.ignore ) {
                                state.prevTagToken = state.currTagToken;
@@ -1955,14 +1966,17 @@
                                }
                        }
 
-                       // SSS FIXME: There are no SelfclosingTagTk types 
constructed
+                       // SSS FIXME: There are no pd.SelfclosingTagTk types 
constructed
                        // right now and can be removed to simplify the code 
and logic.
-                       if (token.constructor === SelfclosingTagTk) {
+                       if (token.constructor === pd.SelfclosingTagTk) {
                                state.wteHandlerStack.pop();
                        }
                        break;
-               case EndTagTk:
+               case pd.EndTagTk:
                        handler = WSP._getTokenHandler( state, token );
+                       // XXX: Return the wteHandler along with the handler 
instead of
+                       // pushing it as a side effect. Avoids the asymmetrical 
popping
+                       // here.
                        state.wteHandlerStack.pop();
                        if ( ! handler.ignore ) {
                                state.prevTagToken = state.currTagToken;
@@ -1997,17 +2011,17 @@
                                state.bufferedSeparator = null;
                        }
                        break;
-               case CommentTk:
+               case pd.CommentTk:
                        res = '<!--' + token.value + '-->';
                        // don't consider comments for changes of the 
onStartOfLine status
                        // XXX: convert all non-tag handlers to a similar 
handler
                        // structure as tags?
                        handler = { solTransparent: true };
                        break;
-               case NlTk:
+               case pd.NlTk:
                        res = textHandler ? textHandler( state, '\n' ) : '\n';
                        break;
-               case EOFTk:
+               case pd.EOFTk:
                        res = '';
                        state.chunkCB( res );
                        break;
@@ -2139,6 +2153,72 @@
        }
 };
 
+
+
+/**
+ * Starting on a text or comment node, collect ws text / comments between
+ * elements.
+ *
+ * Assumptions:
+ * - Called on first text / comment node
+ *
+ * Returns either null if there is no separator, or
+ * {
+ *       nextElement: (next element node),
+ *       sepSrc: the combined separator text
+ * }
+ */
+WSP.gatherSeparatorText = function ( node ) {
+       var startIsFirstChild = !node.previousSibling;
+       if (startIsFirstChild || node.previousSibling.nodeType === 
node.ELEMENT_NODE)
+       {
+               var out = [];
+               while(node) {
+                       if (node.nodeType === node.TEXT_NODE) {
+                               if (node.nodeValue.match(/^\s*$/)) {
+                                       out.push(node.nodeValue);
+                               } else {
+                                       // not a separator between elements, 
bail out
+                                       return null;
+                               }
+                       } else if (node.nodeType === node.COMMENT_NODE) {
+                               out.push('<!--' + node.nodeValue + '-->');
+                       } else if (node.nodeType === node.ELEMENT_NODE) {
+                               if(DU.isMarkerMeta(node, 'mw:DiffMarker')) {
+                                       // ignore it for the separator, but
+                                       // TODO: call chunkCB!!
+                               } else {
+                                       // End of separator run
+                                       return {
+                                               nextElement: node,
+                                               sepSrc: out.join('')
+                                       };
+                               }
+                       }
+                       if (node.nextSibling) {
+                               // Continue gathering.
+                               node = node.nextSibling;
+                       } else if (startIsFirstChild) {
+                               // All children were separator-like, but since 
they don't
+                               // separate two elements (no element node found 
and first node
+                               // was the first child) they are not 
inter-element whitespace.
+                               // Abort match.
+                               return null;
+                       } else {
+                               // Separator between some child and parent
+                               return {
+                                       nextElement: node.parentNode,
+                                       sepSrc: out.join('')
+                               };
+                       }
+               }
+       } else {
+               // No separator.
+               return null;
+       }
+};
+
+
 // 1. Update state with the set of templated attributes.
 // 2. Strip non-semantic leading and trailing newlines.
 WSP.preprocessDOM = function(node, state, inPre, haveOrigSrc) {
@@ -2186,7 +2266,7 @@
                        }
 
                        // Remove it from the DOM
-                       node.parentNode.removeChild(node);
+                       //node.parentNode.removeChild(node);
                }
        } else {
                var about = DU.isElt(node) ? node.getAttribute("about") || "" : 
"";
@@ -2204,9 +2284,11 @@
                        // Collapse a sequence of text nodes and delete empty 
text nodes
                        // NOTE: We could have used body.normalize() and got 
this for free,
                        // but JSDOM is buggy and strips empty comments.
-                       if (child.nodeType === Node.TEXT_NODE) {
+                       // Domino actually exhibits the same behavior. Probably 
a
+                       // misfeature in the spec.
+                       if (child.nodeType === node.TEXT_NODE) {
                                var buf = [child.data];
-                               while (next && next.nodeType === 
Node.TEXT_NODE) {
+                               while (next && next.nodeType === 
node.TEXT_NODE) {
                                        var nextnext = next.nextSibling;
                                        buf.push(next.data);
                                        node.removeChild(next);
@@ -2249,14 +2331,14 @@
                                next = child.nextSibling;
 
                                // Delete empty text nodes
-                               if (nodeType === Node.TEXT_NODE && child.data 
=== '') {
+                               if (nodeType === node.TEXT_NODE && child.data 
=== '') {
                                        node.removeChild(child);
                                        child = next;
                                        continue;
                                }
 
                                if (!haveOrigSrc) {
-                                       if (nodeType === Node.TEXT_NODE) {
+                                       if (nodeType === node.TEXT_NODE) {
                                                str = child.data;
                                                // Strip leading/trailing 
newlines if preceded by or
                                                // followed by block nodes -- 
these newlines are syntactic
@@ -2271,7 +2353,7 @@
                                        }
                                } else {
                                        switch (nodeType) {
-                                               case Node.TEXT_NODE:
+                                               case node.TEXT_NODE:
                                                        str = child.data;
                                                        if (!waitForSentinel && 
str.match(/^\s+$/)) {
                                                                
sepText.push(str);
@@ -2282,7 +2364,7 @@
                                                        }
                                                        break;
 
-                                               case Node.COMMENT_NODE:
+                                               case node.COMMENT_NODE:
                                                        if (!waitForSentinel) {
                                                                
sepText.push("<!--");
                                                                
sepText.push(child.data);
@@ -2291,7 +2373,7 @@
                                                        }
                                                        break;
 
-                                               case Node.ELEMENT_NODE:
+                                               case node.ELEMENT_NODE:
                                                        if (!waitForSentinel && 
DU.isMarkerMeta(child, "mw:DiffMarker")) {
                                                                // Float 
"mw:DiffMarker" to the left till we bump into a sentinel
                                                                
node.insertBefore(child, prevSentinel ? prevSentinel.nextSibling : 
node.firstChild);
@@ -2334,12 +2416,17 @@
                if ( selser === undefined ) {
                        // Clone the DOM if we are not in selser-mode
                        // since we will modify the DOM in preprocessDOM.
+                       //
+                       // XXX: Remove this when the DOM is no longer modified!
                        node = node.cloneNode(true);
                } else {
                        // In selser mode, cloning is not required since
                        // selser passes us a cloned DOM.
                        state.selser = selser;
                }
+
+               // Normalize the DOM (coalesces adjacent text nodes)
+               //node.normalize();
 
                // Preprocess DOM (collect tpl attr tags + strip empty white 
space)
                this.preprocessDOM(node, state, false, state.env.page.src);
@@ -2365,7 +2452,7 @@
                }
 
                this._serializeDOM( node, state );
-               this._serializeToken( state, new EOFTk() );
+               this._serializeToken( state, new pd.EOFTk() );
 
                if ( finalCB && typeof finalCB === 'function' ) {
                        finalCB();
@@ -2389,7 +2476,7 @@
 
 function gatherInlineText(buf, node) {
        switch (node.nodeType) {
-               case Node.ELEMENT_NODE:
+               case node.ELEMENT_NODE:
                        var name = node.nodeName.toLowerCase();
                        if (isHtmlBlockTag(name)) {
                                return;
@@ -2411,10 +2498,10 @@
                        }
 
                        return;
-               case Node.COMMENT_NODE:
+               case node.COMMENT_NODE:
                        buf.push("<--" + node.data + "-->");
                        return;
-               case Node.TEXT_NODE:
+               case node.TEXT_NODE:
                        buf.push(node.data);
                        return;
                default:
@@ -2423,37 +2510,166 @@
 }
 
 /**
+ * Helper for handleSeparator
+ *
+ * Collects, checks and integrates separator newline requirements into a simple
+ * min, max structure.
+ */
+WSP.getSepConstraints = function(nodeA, sepNlsHandlerA, nodeB, sepNlsHandlerB) 
{
+       var constraints = { a:{}, b:{} }, bc;
+       if(sepNlsHandlerA) {
+               constraints.a = sepNlsHandlerA(nodeB);
+               constraints.min = constraints.a.min;
+               constraints.max = constraints.a.max;
+       }
+       if(sepNlsHandlerB) {
+               constraints.b = sepNlsHandlerB(nodeA);
+               var cb = constraints.b;
+
+
+               // now figure out if this conflicts with the constraints so far
+               if (cb.min !== undefined) {
+                       if (constraints.max !== undefined && constraints.max < 
cb.min) {
+                               // Conflict!
+                               return null;
+                       }
+                       constraints.min = Math.max(constraints.min || 0, 
cb.min);
+               }
+
+               if (cb.max !== undefined) {
+                       if (constraints.min !== undefined && constraints.min > 
cb.max) {
+                               // Conflict!
+                               return null;
+                       } else if (constraints.max !== undefined) {
+                               constraints.max = Math.min(constraints.max, 
cb.max);
+                       } else {
+                               constraints.max = cb.max;
+                       }
+               }
+       }
+
+       return constraints;
+};
+
+/**
+ * WIP, not used yet.
+ *
  * Create and emit separator wikitext between element nodes nodeA and nodeB.
  * If sepNode is not null, its content will be taken into account. It is
  * expected to be inter-element whitespace (check with DU.isIEW(node)).
  *
- * Uses the following start/end handler callbacks if defined:
- * startLines(separator, precedingElement) returning [min, max].
- * endLines(separator) returning min.
- *
  * node handlers:
+ *
  * node: {
  *     handle: function(state, node) {},
  *             // responsible for calling
- *     sep: function(node, axis) with axis in
- *             ['before', 'first-child', 'last-child', 'after']
- *             returning min or [min, max]
- *     sep: {
- *             before: function(node)
+ *     sepNls: {
+ *             before: function(node) -> {min: 1, max: 2}
  *             after: function(node)
- *             first_child: function(node)
- *             last_child: function(node)
+ *             firstChild: function(node)
+ *             lastChild: function(node)
  *     }
  * }
  *
  */
-WSP.handleSeparator = function( nodeA, handlerA, nodeB, handlerB, sepNode, 
state ) {
+WSP.handleSeparator = function( state, sep, cb, nodeA, handlerA, nodeB, 
handlerB) {
+       var constraints,
+               i;
+       var sepHandlerA = handlerA && handlerA.sepNls || {},
+               sepHandlerB = handlerB && handlerB.sepNls || {};
        if ( nodeB.parentNode === nodeA ) {
                // parent-child separator, nodeA parent of nodeB
+               constraints = this.getSepConstraints(nodeA, 
sepHandlerA.firstChild,
+                                                                               
        nodeB, sepHandlerB.before);
        } else if ( nodeA.parentNode === nodeB ) {
                // parent-child separator, nodeB parent of nodeA
+               constraints = this.getSepConstraints(nodeA, sepHandlerA.after,
+                                                                               
        nodeB, sepHandlerB.lastChild);
        } else {
                // sibling separator
+               constraints = this.getSepConstraints(nodeA, sepHandlerA.after,
+                                                                               
        nodeB, sepHandlerB.before);
+       }
+
+       console.log('constraints', constraints);
+
+       if (constraints === null) {
+               console.error('Conflicting separator requirements between ' + 
nodeA.nodeName +
+                               ' and ' + nodeB.nodeName);
+               cb('');
+       } else {
+               // Check if the sepNode conforms to the requirements, and 
adjust it if
+               // necessary. We assume that DU.isIEW(sepNode) returned true, 
or there
+               // was no sepNode.
+
+               var sepMatch = sep.match(/\n/g),
+                       sepNlCount = sepMatch && sepMatch.length || 0;
+               if (constraints.min && sepNlCount < constraints.min) {
+                       for (i = 0; i < (constraints.min - sepNlCount); i++) {
+                               sep += '\n';
+                       }
+               } else if (constraints.max && sepNlCount > constraints.match) {
+                       // strip some newlines from the end
+                       for (i = 0; i < (constraints.max - sepNlCount); i++) {
+                               sep = sep.replace(/[^\n]*\n[^\n]*$/, '');
+                       }
+               }
+
+               // XXX: Disabled for now- most line-based block elements move 
comments
+               // outside the DOM element, and still expect the comment to end 
up on
+               // the same line. Trailing spaces on a line don't trigger pres, 
so
+               // leave them in too in the interest of wt2wt round-tripping.
+               //if (constraints.a.min) {
+               //      // Strip leading non-nl ws up to the first newline, but 
keep comments
+               //      
sep.replace(/^([^\n<]*<!--(?:[^\-]+|-(?!->))*-->)?[^\n<]+/g, '$1');
+               //}
+
+               if (constraints.b.min) {
+                       // Strip trailing non-nl ws, but preserve comments
+                       // This avoids triggering pres
+                       sep = 
sep.replace(/[^\n>]+(<!--(?:[^\-]+|-(?!->))*-->[^\n]*)?$/g, '$1');
+               }
+
+               cb(sep);
+       }
+};
+
+/**
+ * Temporary: Test the DOM-based separator handler
+ *
+ * Called on a text node.
+ */
+WSP.testHandleSeparator = function(node, state) {
+       // Gather IEW nodes into string, and call handleSeparator
+       var maybeSeparator = this.gatherSeparatorText(node),
+               prev = node.previousSibling || node.parentNode,
+               prevHandler = WSP.tagHandlers[prev.nodeName.toLowerCase()],
+               cb = function(sep) {
+                       console.log('NEW:', prev.outerHTML + sep + 
next.outerHTML);
+               },
+               next, nextHandler,
+               sepSrc = '';
+       if (maybeSeparator) {
+               //console.log('<origsep>', maybeSeparator.sepSrc, '</origsep>');
+               // call handleSeparator with separator string
+               next = maybeSeparator.nextElement;
+               nextHandler = WSP.tagHandlers[next.nodeName.toLowerCase()];
+               sepSrc = maybeSeparator.sepSrc;
+       } else {
+               // call handleSeparator
+               next = node.nextSibling || node.parentNode;
+               nextHandler = WSP.tagHandlers[next.nodeName.toLowerCase()];
+       }
+
+       // Check if we actually need a separator
+       if ( prev !== next && //
+                       prev.nodeType === node.ELEMENT_NODE &&
+                       next.nodeType === node.ELEMENT_NODE )
+       {
+               // Looks like it.
+               this.handleSeparator(state, sepSrc, cb,
+                                                               prev, 
prevHandler,
+                                                               next, 
nextHandler);
        }
 };
 
@@ -2465,7 +2681,7 @@
 WSP._serializeDOM = function( node, state ) {
        var newNLs;
        // serialize this node
-       if (node.nodeType === Node.ELEMENT_NODE) {
+       if (node.nodeType === node.ELEMENT_NODE) {
                if (state.activeTemplateId &&
                        state.activeTemplateId === node.getAttribute("about"))
                {
@@ -2482,12 +2698,12 @@
                        var typeofVal = node.getAttribute("typeof");
                        if (typeofVal && 
typeofVal.match(/\bmw:Object(\/[^\s]+|\b)/)) {
                                state.activeTemplateId = 
node.getAttribute("about") || "";
-                               var attrs = [ new KV("typeof", 
"mw:TemplateSource") ];
+                               var attrs = [ new pd.KV("typeof", 
"mw:TemplateSource") ];
                                var dps = 
node.getAttribute("data-parsoid-serialize");
                                if (dps) {
-                                       attrs.push(new 
KV("data-parsoid-serialize", dps));
+                                       attrs.push(new 
pd.KV("data-parsoid-serialize", dps));
                                }
-                               var dummyToken = new SelfclosingTagTk("meta",
+                               var dummyToken = new pd.SelfclosingTagTk("meta",
                                        attrs,
                                        { src: this._getDOMRTInfo(node).src }
                                );
@@ -2499,14 +2715,14 @@
                                return;
                        }
                }
-       } else if (node.nodeType !== Node.COMMENT_NODE) {
+       } else if (node.nodeType !== node.COMMENT_NODE) {
                state.activeTemplateId = null;
        }
 
        var i, n, child, children;
 
        switch( node.nodeType ) {
-               case Node.ELEMENT_NODE:
+               case node.ELEMENT_NODE:
                        var nodeName = node.nodeName.toLowerCase(),
                                tkAttribs = 
this._getDOMAttribs(node.attributes),
                                tkRTInfo = this._getDOMRTInfo(node),
@@ -2536,7 +2752,7 @@
                                var dp = 
JSON.parse(node.getAttribute('data-parsoid') || '{}');
                                if ( dp.stx !== 'html' &&
                                        ! dp.tail &&
-                                       node.nextSibling && 
node.nextSibling.nodeType === Node.TEXT_NODE &&
+                                       node.nextSibling && 
node.nextSibling.nodeType === node.TEXT_NODE &&
                                        // TODO: use tokenizer
                                        node.nextSibling.nodeValue &&
                                        
node.nextSibling.nodeValue.match(/^[a-z]/) )
@@ -2555,7 +2771,7 @@
                        if (nodeName === 'pre' && tkRTInfo.stx === 'html') {
                                var modified = false;
                                var fc = node.firstChild;
-                               if (fc && fc.nodeType === Node.TEXT_NODE) {
+                               if (fc && fc.nodeType === node.TEXT_NODE) {
                                        var matches = 
fc.data.match(/^(\r\n|\r|\n)/);
                                        if (matches) {
                                                fc.insertData(0, matches[1]);
@@ -2565,7 +2781,7 @@
 
                                var strippedNL = tkRTInfo.strippedNL;
                                if (!modified && strippedNL) {
-                                       if (fc && fc.nodeType === 
Node.TEXT_NODE) {
+                                       if (fc && fc.nodeType === 
node.TEXT_NODE) {
                                                fc.insertData(0, strippedNL);
                                        } else {
                                                
node.insertBefore(node.ownerDocument.createTextNode(strippedNL), fc);
@@ -2577,6 +2793,7 @@
                        if ( state.selser.serializeInfo === null ) {
                                serializeInfo = node.data['parsoid-serialize'];
                                state.selser.serializeInfo = serializeInfo;
+                               state.chunkCB('', serializeInfo);
                        }
 
                        // See if we have a DOM-based handler for this node
@@ -2590,14 +2807,14 @@
                                                state.onStartOfLine = 
res.match(/\n$/) ? true : false;
                                        }
                                        stateCB(res, serializeInfo);
-                               }
+                               };
 
                                // DOM-based serialization
                                domHandler.handle(state, node, cbWrapper);
 
                                // Fake curToken state for token-based handlers. This is then
                                // assigned to prevToken in _serializeToken.
-                               state.curToken = new EndTagTk(nodeName, 
tkAttribs, tkRTInfo);
+                               state.curToken = new pd.EndTagTk(nodeName, 
tkAttribs, tkRTInfo);
                                state.prevToken = state.curToken;
                                state.currTagToken = state.curToken;
                                state.prevTagToken = state.curToken;
@@ -2605,7 +2822,7 @@
                                // Token-based serialization
 
                                // Serialize the start token
-                               var startToken = new TagTk(nodeName, tkAttribs, 
tkRTInfo);
+                               var startToken = new pd.TagTk(nodeName, 
tkAttribs, tkRTInfo);
                                this._serializeToken(state, startToken);
 
                                // Newly created elements/tags in this list 
inherit their default
@@ -2646,20 +2863,20 @@
 
                                        // Skip over comment, white-space text 
nodes, and tpl-content nodes
                                        var nodeType = child.nodeType;
-                                       if (  nodeType !== Node.COMMENT_NODE &&
-                                               !(nodeType === Node.TEXT_NODE 
&& child.data.match(/^\s*$/)) &&
-                                               !(nodeType === 
Node.ELEMENT_NODE &&
+                                       if (  nodeType !== node.COMMENT_NODE &&
+                                               !(nodeType === node.TEXT_NODE 
&& child.data.match(/^\s*$/)) &&
+                                               !(nodeType === 
node.ELEMENT_NODE &&
                                                        state.activeTemplateId 
&&
                                                        state.activeTemplateId 
=== child.getAttribute("about"))
                                                )
                                        {
-                                               if (child.nodeType === 
Node.ELEMENT_NODE) {
+                                               if (child.nodeType === 
node.ELEMENT_NODE) {
                                                        if (prevEltChild === 
null) {
                                                                if 
(!DU.hasNodeName(node, "pre")) {
                                                                        // 
extract separator text between node and child;
                                                                        
state.emitSeparator(node, child, START_SEP);
                                                                }
-                                                       } else if 
(prevEltChild.nodeType === Node.ELEMENT_NODE) {
+                                                       } else if 
(prevEltChild.nodeType === node.ELEMENT_NODE) {
                                                                if 
(!DU.hasNodeName(node, "pre")) {
                                                                        // 
extract separator text between prevEltChild and child;
                                                                        
state.emitSeparator(prevEltChild, child, IE_SEP);
@@ -2673,7 +2890,7 @@
                                        this._serializeDOM( children[i], state 
);
                                }
 
-                               if (prevEltChild && prevEltChild.nodeType === 
Node.ELEMENT_NODE) {
+                               if (prevEltChild && prevEltChild.nodeType === 
node.ELEMENT_NODE) {
                                        // extract separator text between 
prevEltChild and node
                                        if (!DU.hasNodeName(node, "pre")) {
                                                
state.emitSeparator(prevEltChild, node, END_SEP);
@@ -2686,7 +2903,7 @@
                                state.parentSTX = parentSTX;
 
                                // then the end token
-                               this._serializeToken(state, new 
EndTagTk(nodeName, tkAttribs, tkRTInfo));
+                               this._serializeToken(state, new 
pd.EndTagTk(nodeName, tkAttribs, tkRTInfo));
 
                                if ( tailSrc ) {
                                        // emit the tail
@@ -2699,7 +2916,7 @@
                        }
 
                        break;
-               case Node.TEXT_NODE:
+               case node.TEXT_NODE:
                        if (state.currLine.text === null) {
                                var buf = [],
                                        bn = firstBlockNodeAncestor(node);
@@ -2711,11 +2928,15 @@
                                state.currLine.numPieces = n;
                                state.currLine.text = buf.join('');
                        }
+
+                       // Test the separator handler, but don't use it yet.
+                       //this.testHandleSeparator(node, state);
+
                        this._serializeToken( state, node.data );
                        break;
-               case Node.COMMENT_NODE:
+               case node.COMMENT_NODE:
                        // delay the newline creation until after the comment
-                       this._serializeToken( state, new CommentTk( node.data ) 
);
+                       this._serializeToken( state, new pd.CommentTk( 
node.data ) );
                        break;
                default:
                        console.warn( "Unhandled node type: " +
diff --git a/js/lib/mediawiki.parser.defines.js b/js/lib/mediawiki.parser.defines.js
index 9f10525..ac27d4c 100644
--- a/js/lib/mediawiki.parser.defines.js
+++ b/js/lib/mediawiki.parser.defines.js
@@ -725,9 +725,20 @@
        }
 };
 
-// TODO: don't use globals!
 if (typeof module === "object") {
-       module.exports = {};
+       module.exports = {
+               TagTk: TagTk,
+               InternalTk: InternalTk,
+               EndTagTk: EndTagTk,
+               SelfclosingTagTk: SelfclosingTagTk,
+               NlTk: NlTk,
+               CommentTk: CommentTk,
+               EOFTk: EOFTk,
+               KV: KV,
+               Params: Params,
+               ParserValue: ParserValue
+       };
+       // TODO: don't use globals!
        global.TagTk = TagTk;
        global.InternalTk = InternalTk;
        global.EndTagTk = EndTagTk;

-- 
To view, visit https://gerrit.wikimedia.org/r/48974
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie0aa7530918da9eb584f2039d49adceeb9f56f5f
Gerrit-PatchSet: 11
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: GWicke <[email protected]>
Gerrit-Reviewer: GWicke <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Basic implementation of purely DOM-based separator handling - change (mediawiki...Parsoid)

Reply via email to