jenkins-bot has submitted this change and it was merged.
Change subject: Create section divs before running JS transforms
......................................................................
Create section divs before running JS transforms
The JavaScript transforms used in the app are meant to be applied on a
section-by-section basis, so having a means of isolating sections in which to
perform the transforms before splitting into the final JSON structure is
necessary to prevent bugs such as T65874.
This required substantial changes to the definition parsing code, since that was
very tightly coupled with the old way of parsing sections.
This patch also splits the transforms applied for every request into two
methods: one for additive and one for destructive transforms. This is helpful
because otherwise it's too easy to unintentionally clobber markup we've just
added.
Finally, lead paragraph shifting is disabled here so that it can be adapted
for the new section parsing code in a patch to follow.
Change-Id: I8c049740c390070c2be83b2efcc414c531761c9b
---
M lib/parseDefinition.js
M lib/parseSection.js
M lib/parsoid-access.js
M lib/transformations/relocateFirstParagraph.js
M lib/transforms.js
M test/lib/parsoid/parsoid-access-test.js
6 files changed, 128 insertions(+), 82 deletions(-)
Approvals:
BearND: Looks good to me, approved
jenkins-bot: Verified
diff --git a/lib/parseDefinition.js b/lib/parseDefinition.js
index 0a57a55..247d015 100644
--- a/lib/parseDefinition.js
+++ b/lib/parseDefinition.js
@@ -82,31 +82,17 @@
return languageList[langName];
}
-function initializeDefinitionSection(obj) {
- var result = obj;
- result.partOfSpeech = stripMarkup(obj.line);
- result.toclevel = result.line = result.anchor = undefined;
- result.definitions = [];
- return result;
-}
-
/**
* This is where the sausage is made. It seems no two Wiktionaries are
* formatted the same way, so we'll figure out where they (usually) keep the
* actual definitions on a wiki-by-wiki basis and pluck them out; this is
* usually at least *internally* consistent.
*/
-function getDefnList(output, wikiLangCode) {
- var defnList, ddList, doc;
-
- doc = domino.createDocument(output.text);
- transforms.inlineSpanText(doc);
- transforms.rmElementsWithSelector(doc, 'sup');
-
+function getDefnList(doc, id, wikiLangCode) {
+ var defnList;
switch (wikiLangCode) {
case 'en':
- transforms.rmElementsWithSelector(doc, 'ul');
- defnList = doc.querySelectorAll('li');
+ defnList = doc.querySelectorAll('div[id=' + id + '] li');
break;
//case 'fr':
// defnList = doc.querySelector('ol').children;
@@ -140,37 +126,38 @@
*/
function parse(doc, domain, title) {
// TODO: update once Parsoid emits section tags, see
https://phabricator.wikimedia.org/T114072#1711063
- var definitions = {},
+ var i, j,
+ definitions = {},
definitionSection,
- header,
- i,
- output,
defnList,
defnLangCode = null,
- currentLangSection,
- node = doc.body.firstChild,
- hasContent = false,
+ header,
+ sectionDivs = doc.querySelectorAll('div[id^="section_"]'),
+ currentSectionDiv,
wikiLangCode = domain.split('.')[0];
- output = parseSection(node);
+ // Language-specific transforms
+ if (wikiLangCode === 'en') {
+ transforms.rmElementsWithSelector(doc, 'ul');
+ }
- while (output.nextNode) {
- hasContent = true;
- header = stripMarkup(output.nextSection.line);
+ for (j = 0; j < sectionDivs.length; j++) {
+ currentSectionDiv = sectionDivs[j];
+ header = stripMarkup(currentSectionDiv.title);
- /* Get the language from the first H1 header, and begin iterating over
sections.
- Per the English Wiktionary style guide (linked in header above), H1
headings
+ /* Get the language from the first H2 header, and begin iterating over
sections.
+ Per the English Wiktionary style guide (linked in header above), H2
headings
are language names. */
- if (output.nextSection.toclevel === 1) {
+ if (currentSectionDiv.className.substring('toclevel_'.length) === "1")
{
defnLangCode = getLanguageCode(header, wikiLangCode);
}
/* Parse definitions from part-of-speech sections */
if (defnLangCode && partsOfSpeech[wikiLangCode].indexOf(header) > -1) {
- definitionSection =
initializeDefinitionSection(output.nextSection);
- output = parseSection(output.nextNode);
-
- defnList = getDefnList(output, wikiLangCode);
+ definitionSection = {};
+ definitionSection.partOfSpeech = header;
+ definitionSection.definitions = [];
+ defnList = getDefnList(doc, currentSectionDiv.id, wikiLangCode);
for (i = 0; i < defnList.length; i++) {
definitionSection.definitions.push(constructDefinition(defnList[i],
wikiLangCode));
}
@@ -180,9 +167,6 @@
}
definitions[defnLangCode].push(definitionSection);
- } else {
- //TODO: just update output with reference to next node; don't
needlessly "collect"
- output = parseSection(output.nextNode);
}
}
diff --git a/lib/parseSection.js b/lib/parseSection.js
index 38e3f3f..b9afcc7 100644
--- a/lib/parseSection.js
+++ b/lib/parseSection.js
@@ -3,29 +3,26 @@
var transforms = require('./transforms');
var a = require('./anchorencode');
-function parse(startingNode) {
+function parse(sectionDiv, startingNode) {
var node = startingNode,
- text = '',
+ nextNode,
nextSection = {};
while (node) {
- if (node.nodeType === transforms.NodeType.TEXT) {
- text = text + node.nodeValue;
- node = node.nextSibling;
+ if (!(/^H[2-6]$/.test(node.tagName))) {
+ nextNode = node.nextSibling;
+ sectionDiv.appendChild(node);
+ node = nextNode;
continue;
- } else if (/^H[2-6]$/.test(node.tagName)) { // heading tag
+ } else {
nextSection.toclevel = parseInt(node.tagName.charAt(1)) - 1;
nextSection.line = node.innerHTML.trim();
nextSection.anchor = a.anchorencode(nextSection.line);
node = node.nextSibling;
break;
}
-
- if (node.outerHTML) { // had some "undefined" values creeping into the
output without this check
- text = text + node.outerHTML;
- }
node = node.nextSibling;
}
- return { text: text.trim(), nextNode: node, nextSection: nextSection};
+ return { sectionDiv: sectionDiv, nextNode: node, nextSection: nextSection};
}
module.exports = parse;
diff --git a/lib/parsoid-access.js b/lib/parsoid-access.js
index d6df45d..15b4db6 100644
--- a/lib/parsoid-access.js
+++ b/lib/parsoid-access.js
@@ -6,14 +6,17 @@
var preq = require('preq');
var domino = require('domino');
+var a = require('./anchorencode');
var Template = require('swagger-router').Template;
var sUtil = require('../lib/util');
var mwapi = require('./mwapi');
var parseSection = require('./parseSection');
var parseProperty = require('./parseProperty');
var parseDefinition = require('./parseDefinition');
+var relocateFirstParagraph =
require('./transformations/relocateFirstParagraph');
var transforms = require('./transforms');
var HTTPError = sUtil.HTTPError;
+
var REDIRECT_REGEXP = /<link rel="mw:PageProp\/redirect" href="\.\/([^"]+)"/;
@@ -117,29 +120,69 @@
/**
* @param {document} doc the parsed DOM Document of the Parsoid output
- * @return {sections[]} an array of section JSON elements
+ * @return {document} the DOM document with added Section divs
*/
-function getSectionsText(doc) {
+function addSectionDivs(doc) {
// TODO: update once Parsoid emits section tags, see
https://phabricator.wikimedia.org/T114072#1711063
var sections = [],
section,
output,
i = 0,
+ sectionDiv,
node = doc.body.firstChild;
- output = parseSection(node);
- sections.push({
- "id": 0,
- "text": output.text
- });
+ sectionDiv = doc.createElement('div');
+ sectionDiv.id = 'section_' + i;
+ sectionDiv.className = 'toclevel_1';
+ output = parseSection(sectionDiv, node);
+ i++;
+
+ if (output.nextNode) {
+ doc.body.insertBefore(output.sectionDiv, output.nextNode);
+ } else {
+ doc.body.appendChild(output.sectionDiv);
+ }
while (output.nextNode) {
section = output.nextSection;
+ sectionDiv = doc.createElement('div');
+ sectionDiv.id = 'section_' + i;
+ sectionDiv.className = 'toclevel_' + section.toclevel;
+ sectionDiv.title = section.line;
+ output = parseSection(sectionDiv, output.nextNode);
+ if (output.nextNode) {
+ doc.body.insertBefore(output.sectionDiv, output.nextNode);
+ } else {
+ doc.body.appendChild(output.sectionDiv);
+ }
i++;
- output = parseSection(output.nextNode);
- section.id = i;
- section.text = output.text;
- sections.push(section);
+ }
+}
+
+/**
+ * @param {document} doc the parsed DOM Document of the Parsoid output
+ * @return {sections[]} an array of section JSON elements
+ */
+function getSectionsText(doc) {
+ // TODO: update once Parsoid emits section tags, see
https://phabricator.wikimedia.org/T114072#1711063
+ var sections = [],
+ currentSection,
+ sectionDivs = doc.querySelectorAll('div[id^=section]'),
+ currentSectionDiv;
+
+ for (var i = 0; i < sectionDivs.length; i++) {
+ currentSection = {};
+ currentSectionDiv = sectionDivs[i];
+ currentSection.id = i;
+ currentSection.text = currentSectionDiv.innerHTML;
+
+ if (i !== 0) {
+ currentSection.toclevel =
parseInt(currentSectionDiv.className.substring('toclevel_'.length));
+ currentSection.line = currentSectionDiv.title;
+ currentSection.anchor = a.anchorencode(currentSection.line);
+ }
+
+ sections.push(currentSection);
}
return sections;
@@ -178,10 +221,22 @@
.then(function (response) {
var page = { revision: getRevisionFromEtag(response.headers) };
var doc = domino.createDocument(response.body);
+
+ // Note: these properties must be obtained before stripping markup
page.lastmodified = getModified(doc);
parseProperty.parseGeo(doc, page);
parseProperty.parseSpokenWikipedia(doc, page);
- transforms.runParsoidDomTransforms(doc);
+
+ transforms.stripUnneededMarkup(doc);
+ addSectionDivs(doc);
+ transforms.addRequiredMarkup(doc);
+
+ // Move the first good paragraph up for any page except main pages.
+ // It's ok to do unconditionally since we throw away the page
+ // content if this turns out to be a main page.
+ //
+ // TODO: should we also exclude file and other special pages?
+ //relocateFirstParagraph(doc);
page.sections = getSectionsText(doc);
return page;
@@ -213,7 +268,11 @@
return getContent(app, req)
.then(function (response) {
var doc = domino.createDocument(response.body);
- transforms.runParsoidDomTransforms(doc);
+ transforms.stripUnneededMarkup(doc);
+ transforms.rmElementsWithSelector(doc, 'sup');
+ transforms.inlineSpanText(doc);
+ addSectionDivs(doc);
+ transforms.addRequiredMarkup(doc);
return parseDefinition(doc, req.params.domain, req.params.title);
});
}
@@ -223,6 +282,7 @@
definitionPromise: definitionPromise,
// VisibleForTesting
+ _addSectionDivs: addSectionDivs,
_getSectionsText: getSectionsText,
_getRedirectTitleFromLocationHeader: getRedirectTitleFromLocationHeader,
_hasRedirectInPayload: hasRedirectInPayload
diff --git a/lib/transformations/relocateFirstParagraph.js
b/lib/transformations/relocateFirstParagraph.js
index e2a2eb8..cf4b4fa 100644
--- a/lib/transformations/relocateFirstParagraph.js
+++ b/lib/transformations/relocateFirstParagraph.js
@@ -96,6 +96,4 @@
block_0.insertBefore( leadSpan, block_0.firstChild );
}
-module.exports = {
- moveFirstGoodParagraphUp: moveFirstGoodParagraphUp
-};
+module.exports = moveFirstGoodParagraphUp;
diff --git a/lib/transforms.js b/lib/transforms.js
index cde8d8a..3e2c773 100644
--- a/lib/transforms.js
+++ b/lib/transforms.js
@@ -7,7 +7,6 @@
var domino = require('domino');
var util = require('util');
-var relocateFirstParagraph =
require('./transformations/relocateFirstParagraph');
var anchorPopUpMediaTransforms =
require('./transformations/anchorPopUpMediaTransforms');
var hideRedLinks = require('./transformations/hideRedLinks');
var hideIPA = require('./transformations/hideIPA');
@@ -135,7 +134,7 @@
rmComments(doc);
}
-function applyParsoidSpecificTransformations(doc) {
+function addParsoidSpecificMarkup(doc) {
rewriteUrlAttribute(doc, 'a', 'href');
// Set <a class=\'external\"
@@ -148,9 +147,6 @@
// so the app knows to view these images in the gallery.
// See also
https://www.mediawiki.org/wiki/Parsoid/MediaWiki_DOM_spec#Images
addClassTo(doc, 'figure a, span[typeof^=mw:Image] a', 'image');
-
- // Remove reference link backs since we don't want them to appear in the
reference dialog; they are not needed.
- removeRefLinkbacks(doc);
}
function applyOptionalParsoidSpecificTransformations(doc) {
@@ -191,18 +187,22 @@
'span#coordinates'
// Would also like to use this but does not have any effect;
https://github.com/fgnass/domino/issues/59
-
//'span[style~="display:none"]', // Remove <span
style=\"display:none;\"> </span>
];
removeUnwantedElements(doc, rmSelectors);
- anchorPopUpMediaTransforms.fixVideoAnchor(doc);
- hideRedLinks.hideRedLinks(doc);
- hideIPA.hideIPA(doc);
- setMathFormulaImageMaxWidth.setMathFormulaImageMaxWidth(doc);
+ // Remove reference link backs since we don't want them to appear in the
reference dialog; they are not needed.
+ removeRefLinkbacks(doc);
}
+/**
+ * Destructive, non-Parsoid-specific transforms previously performed in the
app.
+ */
+function removeUnwantedWikiContentForApp(doc) {
+ hideRedLinks.hideRedLinks(doc);
+ hideIPA.hideIPA(doc);
+}
///**
// * Nukes stuff from the lead section we don't want.
@@ -219,16 +219,18 @@
/**
* Nukes stuff from the DOM we don't want for pages from Parsoid.
*/
-function runParsoidDomTransforms(doc) {
+function stripUnneededMarkup(doc) {
runAllSectionsTransforms(doc);
//runLeadSectionTransforms(doc);
- applyParsoidSpecificTransformations(doc);
applyOptionalParsoidSpecificTransformations(doc);
- // any page except main pages. It's ok to do unconditionally since we
throw away
- // the page content if this turns out to be a main page.
- // TODO: should we also exclude file and other special pages?
- relocateFirstParagraph.moveFirstGoodParagraphUp(doc);
+ removeUnwantedWikiContentForApp(doc);
+}
+
+function addRequiredMarkup(doc) {
+ addParsoidSpecificMarkup(doc);
+ setMathFormulaImageMaxWidth.setMathFormulaImageMaxWidth(doc);
+ anchorPopUpMediaTransforms.fixVideoAnchor(doc);
}
/**
@@ -245,7 +247,8 @@
module.exports = {
NodeType: NodeType,
- runParsoidDomTransforms: runParsoidDomTransforms,
+ stripUnneededMarkup: stripUnneededMarkup,
+ addRequiredMarkup: addRequiredMarkup,
runMainPageDomTransforms: runMainPageDomTransforms,
rmElementsWithSelector: rmElementsWithSelector,
inlineSpanText: inlineSpanText,
diff --git a/test/lib/parsoid/parsoid-access-test.js
b/test/lib/parsoid/parsoid-access-test.js
index 9332fde..45f84f0 100644
--- a/test/lib/parsoid/parsoid-access-test.js
+++ b/test/lib/parsoid/parsoid-access-test.js
@@ -34,6 +34,7 @@
it('getSectionsText(empty) should produce an empty lead section',
function() {
var doc = domino.createDocument('<body></body>');
+ parsoid._addSectionDivs(doc);
var sections = parsoid._getSectionsText(doc);
assert.deepEqual(sections.length, 1);
assert.deepEqual(sections[0].id, 0);
@@ -42,6 +43,7 @@
it('getSectionsText() with just text should produce a lead section',
function() {
var doc = domino.createDocument('<body>text0</body>');
+ parsoid._addSectionDivs(doc);
var sections = parsoid._getSectionsText(doc);
assert.deepEqual(sections.length, 1);
assertSection0(sections);
@@ -49,6 +51,7 @@
it('getSectionsText() with one h2 should produce two sections', function()
{
var doc = domino.createDocument('<body>text0<h2>foo</h2>text1</body>');
+ parsoid._addSectionDivs(doc);
var sections = parsoid._getSectionsText(doc);
assert.deepEqual(sections.length, 2);
assertSection0(sections);
@@ -57,6 +60,7 @@
it('getSectionsText() with one h2 and h3 should produce three sections',
function() {
var doc = domino.createDocument('<body>text0<h2>foo</h2>text1<h3
id="mwBa">Funny section !@#$%^&*()</h3>text2</body>');
+ parsoid._addSectionDivs(doc);
var sections = parsoid._getSectionsText(doc);
assert.deepEqual(sections.length, 3);
assertSection0(sections);
--
To view, visit https://gerrit.wikimedia.org/r/277511
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I8c049740c390070c2be83b2efcc414c531761c9b
Gerrit-PatchSet: 6
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: Mholloway <[email protected]>
Gerrit-Reviewer: BearND <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: Dbrant <[email protected]>
Gerrit-Reviewer: Fjalapeno <[email protected]>
Gerrit-Reviewer: GWicke <[email protected]>
Gerrit-Reviewer: Mholloway <[email protected]>
Gerrit-Reviewer: Mhurd <[email protected]>
Gerrit-Reviewer: Mobrovac <[email protected]>
Gerrit-Reviewer: Niedzielski <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits