Subramanya Sastry has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/201387

Change subject: WIP: T94599: <a> tags with invalid hrefs serialize to text
......................................................................

WIP: T94599: <a> tags with invalid hrefs serialize to text

* Mostly functional except for one edge case that leads to
  double nowiki-ing. Needs some refactoring to get it working.

Change-Id: Ifaaf2a399a1443556abf067a9a5fb430abbd02da
---
M lib/wts.LinkHandler.js
M tests/parserTests.txt
2 files changed, 56 insertions(+), 27 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/87/201387/1

diff --git a/lib/wts.LinkHandler.js b/lib/wts.LinkHandler.js
index 6cc6709..dcf0e25 100644
--- a/lib/wts.LinkHandler.js
+++ b/lib/wts.LinkHandler.js
@@ -239,25 +239,28 @@
        // First, entity-escape the content.
        linkTarget = Util.escapeEntities(linkTarget);
 
-       // Wikitext-escape content.
-       //
-       // When processing link text, we are no longer in newline state
-       // since that will be preceded by "[[" or "[" text in target wikitext.
-       state.onSOL = false;
-       
state.wteHandlerStack.push(state.serializer.wteHandlers.wikilinkHandler);
-       state.inLink = true;
-       var res = state.serializer.wteHandlers.escapeWikiText(state, 
linkTarget, { node: node });
-       state.inLink = false;
-       state.wteHandlerStack.pop();
-
+       // Is this an invalid link?
+       var invalidLink = false;
        if (!suppressLinkTest &&
                (!state.env.isValidLinkTarget(linkTarget) || 
/[\|]/.test(linkTarget)))
        {
-               linkTarget = "MediaWiki:Badtitletext";
-               state.env.log("error", "Bad title text", node.outerHTML);
+               invalidLink = true;
        }
 
-       return { contentSrc: res, linkTarget: linkTarget };
+       // Wikitext-escape content.
+       var res;
+       if (!invalidLink) {
+               // When processing link text, we are no longer in newline state
+               // since that will be preceded by "[[" or "[" text in target 
wikitext.
+               state.onSOL = false;
+               
state.wteHandlerStack.push(state.serializer.wteHandlers.wikilinkHandler);
+               state.inLink = true;
+               res = state.serializer.wteHandlers.escapeWikiText(state, 
linkTarget, { node: node });
+               state.inLink = false;
+               state.wteHandlerStack.pop();
+       }
+
+       return { contentSrc: res, linkTarget: linkTarget, invalidLink: 
invalidLink };
 };
 
 /**
@@ -370,8 +373,10 @@
 function serializeAsWikiLink(node, state, linkData, cb) {
        var contentParts,
                contentSrc = '',
+               isPiped = false,
                env = state.env,
                wiki = env.conf.wiki,
+               oldSOLState = state.onSOL,
                target = linkData.target,
                dp = DU.getDataParsoid(node);
 
@@ -465,8 +470,6 @@
                                }
                        }
                }
-               cb( new WikiLinkText( linkData.prefix + '[[' + linkTarget + 
']]', node, wiki, linkData.type ), node );
-               return;
        } else if ( isSimpleWikiLink(env, dp, target, linkData) ) {
                // Simple case
                if (!target.modified && !linkData.contentModified) {
@@ -482,9 +485,10 @@
                                linkTarget = ':' + linkTarget;
                        }
                }
-               cb( new WikiLinkText( linkData.prefix + '[[' + linkTarget + 
']]' + linkData.tail, node, wiki, linkData.type ), node );
-               return;
        } else {
+               // Emit piped wikilink syntax
+               isPiped = true;
+
                var usePT = usePipeTrick(env, dp, target, linkData);
 
                // First get the content source
@@ -526,17 +530,36 @@
                        if (!linkData.isInterwiki && !linkContentIsRelative) {
                                linkTarget = linkTarget.replace(/_/g, ' ');
                        }
-                       escapedRes = escapeWikiLinkContentString(linkTarget,
-                               state, node);
+                       escapedRes = escapeWikiLinkContentString(linkTarget, 
state, node);
                        linkTarget = escapedRes.linkTarget;
                }
                linkTarget = addColonEscape(env, linkTarget, linkData);
+       }
 
+       // If the link target was invalid, instead of emitting an invalid link,
+       // omit the link and serialize just the content instead. But, log the
+       // invalid html for Parsoid clients to investigate later.
+       var pipedText;
+       if (escapedRes && escapedRes.invalidLink) {
+               // Log it
+               state.env.log("error", "Bad title text", node.outerHTML);
+
+               // For non-piped content, use the original invalid link text
+               pipedText = isPiped ? contentSrc : linkTarget;
+
+               // Escape the text in the old sol context
+               //
+               // SSS FIXME: There is one scenario where contentSrc has already
+               // been escaped .. So, we have to detect that and eliminate 
double-escaping
+               state.onSOL = oldSOLState;
+               pipedText = state.serializer.wteHandlers.escapeWikiText(state, 
pipedText, { node: node });
+
+               cb(linkData.prefix + pipedText + linkData.tail, node);
+       } else {
+               pipedText = isPiped ? '|' + contentSrc : '';
                cb( new WikiLinkText(
-                       linkData.prefix +
-                       '[[' + linkTarget + '|' + contentSrc + ']]' +
-                       linkData.tail, node, wiki, linkData.type ), node );
-               return;
+                       linkData.prefix + '[[' + linkTarget + pipedText + ']]' 
+ linkData.tail,
+                       node, wiki, linkData.type ), node );
        }
 }
 
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index dc63a94..6b6d14d 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -5208,13 +5208,19 @@
 !! end
 
 !! test
-Replace invalid link targets when serializing
+Serialize <a> tags with invalid link targets as plain text
 !! options
 parsoid=html2wt
 !! html
-<a rel="mw:WikiLink" href="./]] foo [[bar">Manual</a>
+<a rel="mw:WikiLink" href="[[foo]]">text</a>
+<a rel="mw:WikiLink" href="[[foo]]">*text</a>
+<a rel="mw:WikiLink" href="[[foo]]">[[foo]]</a>
+<a rel="mw:WikiLink" href="[[foo]]">*a [[foo]]</a>
 !! wikitext
-[[MediaWiki:Badtitletext|Manual]]
+text
+<nowiki>*</nowiki>text
+<nowiki>[[foo]]</nowiki>
+<nowiki>*a [[foo]]</nowiki>
 !! end
 
 ###

-- 
To view, visit https://gerrit.wikimedia.org/r/201387
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ifaaf2a399a1443556abf067a9a5fb430abbd02da
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to