BearND has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/404247 )
Change subject: summary: just parse the lead section
......................................................................
summary: just parse the lead section
No need to try to parse other sections for the summary endpoint.
Change-Id: I02828100fd1a81c94e1a842c3264caba51c634ed
---
M lib/parsoidSections.js
M lib/parsoidSectionsUsingDivs.js
M lib/parsoidSectionsUsingSectionTags.js
M lib/summary.js
M routes/summary.js
M test/lib/parsoid/parsoid-sections-div-element-test.js
M test/lib/parsoid/parsoid-sections-section-elements-tests.js
7 files changed, 69 insertions(+), 16 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps
refs/changes/47/404247/1
diff --git a/lib/parsoidSections.js b/lib/parsoidSections.js
index f8e21d6..f785685 100644
--- a/lib/parsoidSections.js
+++ b/lib/parsoidSections.js
@@ -38,8 +38,23 @@
}
}
+/**
+ * Reduces the input Parsoid document to a new Document containing
+ * just the first section.
+ * @param {!document} doc the parsed DOM Document of the Parsoid output
+ * @return {!Document} a new DOM Document containing only the lead section
+ */
+function justLeadSection(doc) {
+ if (!hasParsoidSections(doc)) {
+ return parsoidSectionsUsingDivs.justLeadSection(doc);
+ } else {
+ return parsoidSectionsUsingSectionTags.justLeadSection(doc);
+ }
+}
+
module.exports = {
hasParsoidSections,
addSectionDivs,
- getSectionsText
+ getSectionsText,
+ justLeadSection
};
diff --git a/lib/parsoidSectionsUsingDivs.js b/lib/parsoidSectionsUsingDivs.js
index a5886dd..5d11837 100644
--- a/lib/parsoidSectionsUsingDivs.js
+++ b/lib/parsoidSectionsUsingDivs.js
@@ -1,11 +1,11 @@
'use strict';
/*
- This is an old way of doing sectioning, which is preserved only for the
definitions endpoint.
- TODO: We should move the definition parsing implementation to the new way
- (parsoidSectionsUsingSectionTags.js) soon since when Parsoid deploys adding
<section> tags the
- definitions code may be negatively affected by that.
+ This is an old way of doing sectioning, which is preserved only just in case
+ we encounter a payload from an older Parsoid version.
*/
+
+const domino = require('domino');
function parseNextSection(sectionDiv, startingNode) {
let nextNode;
@@ -96,7 +96,18 @@
return sections;
}
+/**
+ * @param {!document} doc the parsed DOM Document of the Parsoid output
+ * @return {!Document} a new DOM Document containing only the lead section
+ */
+function justLeadSection(doc) {
+ const sectionDiv = doc.createElement('div');
+ parseNextSection(sectionDiv, doc.body.firstChild);
+ return domino.createDocument(sectionDiv.innerHTML);
+}
+
module.exports = {
addSectionDivs,
- getSectionsText
+ getSectionsText,
+ justLeadSection
};
diff --git a/lib/parsoidSectionsUsingSectionTags.js
b/lib/parsoidSectionsUsingSectionTags.js
index a30d537..aab9cd3 100644
--- a/lib/parsoidSectionsUsingSectionTags.js
+++ b/lib/parsoidSectionsUsingSectionTags.js
@@ -1,5 +1,6 @@
'use strict';
+const domino = require('domino');
const parsoidDomUtils = require('parsoid-dom-utils');
const NodeType = require('./nodeType');
@@ -18,7 +19,7 @@
/**
* Gets the section number from Parsoid.
- * @param {!DOMElement} sectionElement a <section> DOM element
+ * @param {!Element} sectionElement a <section> DOM element
* @return {int} the section number as reported by Parsoid
*/
function getSectionNumber(sectionElement) {
@@ -66,7 +67,7 @@
* In this case, starts a new section object when a <section> tag is
encountered that is a
* direct descendant
* @param {!Logger} logger the app's bunyan logger
- * @param {!DOMNode} node the node to visit
+ * @param {!Node} node the node to visit
* @param {!Object[]} allSections the array containing the results
* @param {!Object} state some state to pass around
*/
@@ -103,7 +104,7 @@
* descending so we don't duplicate content since elsewhere outerHTML is used
(which contains
* content of sub-nodes).
* @param {!Logger} logger the app's bunyan logger
- * @param {!DOMElement} rootElement the root of the DOM tree which needs to be
traversed
+ * @param {!Element} rootElement the root of the DOM tree which needs to be
traversed
* @param {!Object[]} allSections holds the results
*/
function traverseDF(logger, rootElement, allSections) {
@@ -130,9 +131,19 @@
return allSections;
}
+/**
+ * @param {!document} doc the parsed DOM Document of the Parsoid output
+ * @return {!Document} a new DOM Document containing only the lead section
+ */
+function justLeadSection(doc) {
+ const currentSectionElement =
doc.querySelector('section[data-mw-section-id]');
+ return domino.createDocument(currentSectionElement.innerHTML);
+}
+
module.exports = {
addSectionTags,
getSectionsText,
+ justLeadSection,
testing: {
parseSections: traverseDF,
shouldLogInvalidSectionWarning,
diff --git a/lib/summary.js b/lib/summary.js
index 6e18df5..37e5450 100644
--- a/lib/summary.js
+++ b/lib/summary.js
@@ -77,10 +77,9 @@
* @param {!String} html page content and metadata from Parsoid
* @param {!Object} revTid revision and tid from Parsoid
* @param {!Object} meta metadata from MW API
- * @param {!Logger} logger a bunyan logger
* @return {!Object} a summary 2.0 spec-compliant page summary object
*/
-function buildSummary(domain, title, html, revTid, meta, logger) {
+function buildSummary(domain, title, html, revTid, meta) {
const isContentModelWikitext = meta.contentmodel === 'wikitext';
const isWhiteListedNamespace = SUMMARY_NS_WHITELIST.includes(meta.ns);
const isRedirect = meta.redirect;
@@ -98,10 +97,7 @@
}
const doc = domino.createDocument(html);
- parsoidSections.addSectionDivs(doc);
- const sections = parsoidSections.getSectionsText(doc, logger);
-
- const leadSectionDoc = domino.createDocument(sections[0].text);
+ const leadSectionDoc = parsoidSections.justLeadSection(doc);
const intro = transforms.extractLeadIntroduction(leadSectionDoc);
const summary = intro.length ? transforms.summarize(intro) : { extract:
'', extract_html: '' };
diff --git a/routes/summary.js b/routes/summary.js
index 07577e3..9bd89af 100644
--- a/routes/summary.js
+++ b/routes/summary.js
@@ -32,7 +32,7 @@
const revTid = parsoid.getRevAndTidFromEtag(response.html.headers);
const title = Title.newFromText(req.params.title, response.siteinfo);
const summary = lib.buildSummary(req.params.domain, title,
- response.html.body, revTid, response.meta, req.logger);
+ response.html.body, revTid, response.meta);
res.status(summary.code);
if (summary.code === 200) {
delete summary.code;
diff --git a/test/lib/parsoid/parsoid-sections-div-element-test.js
b/test/lib/parsoid/parsoid-sections-div-element-test.js
index 04622b9..df407ba 100644
--- a/test/lib/parsoid/parsoid-sections-div-element-test.js
+++ b/test/lib/parsoid/parsoid-sections-div-element-test.js
@@ -92,4 +92,14 @@
assertSection0(sections);
assertSection1(sections, extraText);
});
+
+ describe('justLeadSection', () => {
+ it('should just return the first section', () => {
+ const doc = domino.createDocument('<body><p>text0</p>' +
+ '<h2 id="foo">foo</h2>text1' +
+ '</body>');
+ const leadSectionDoc = parsoid.justLeadSection(doc);
+ assert.deepEqual(leadSectionDoc.body.innerHTML, '<p>text0</p>');
+ });
+ });
});
diff --git a/test/lib/parsoid/parsoid-sections-section-elements-tests.js
b/test/lib/parsoid/parsoid-sections-section-elements-tests.js
index 36d8644..c05eb79 100644
--- a/test/lib/parsoid/parsoid-sections-section-elements-tests.js
+++ b/test/lib/parsoid/parsoid-sections-section-elements-tests.js
@@ -167,4 +167,14 @@
detail: 'Cannot find heading for section number 1.'
}]]);
});
+
+ describe('justLeadSection', () => {
+ it('should just return the first section', () => {
+ const doc = domino.createDocument(
+ '<section data-mw-section-id="0"><p>text0</p></section>' +
+ '<section data-mw-section-id="1"><h2
id="foo">foo</h2>text1</section>');
+ const leadSectionDoc =
parsoidSectionsUsingSectionTags.justLeadSection(doc);
+ assert.deepEqual(leadSectionDoc.body.innerHTML, '<p>text0</p>');
+ });
+ });
});
--
To view, visit https://gerrit.wikimedia.org/r/404247
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I02828100fd1a81c94e1a842c3264caba51c634ed
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: BearND <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits