BearND has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/352963 )
Change subject: Add read-html route
......................................................................
Add read-html route
Now using parsoid-dom-utils to add <section> elements and split up the
DOM based on <section> elements. Note that <section> elements can be
nested, but we stop compiling section text for the current section as
soon as we hit the next sub-section.
Skip removing div.infobox since we'll later need that for parsing
spoken_Wikipedia.
Skip removing span.coordinates since they include span.geo, which
we'll later want to parse.
Added test for parseSpokenWikipedia.
Renamed test for parseInfobox.js to parse-infobox-test.js.
Change-Id: I4d408349f40f9289e63a6f6f3b955cf403f48fc3
---
D lib/parseSection.js
M lib/parsoid-access.js
M lib/transforms.js
M package.json
A routes/read-html.js
R test/lib/parseProperty/parse-infobox-test.js
A test/lib/parseProperty/parse-spoken-wikipedia-test.js
7 files changed, 114 insertions(+), 67 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps
refs/changes/63/352963/1
diff --git a/lib/parseSection.js b/lib/parseSection.js
deleted file mode 100644
index 16934de..0000000
--- a/lib/parseSection.js
+++ /dev/null
@@ -1,25 +0,0 @@
-'use strict';
-
-function parse(sectionDiv, startingNode) {
- let nextNode;
- const nextSection = {};
- let node = startingNode;
-
- while (node) {
- if (!(/^H[2-6]$/.test(node.tagName))) {
- nextNode = node.nextSibling;
- sectionDiv.appendChild(node);
- node = nextNode;
- continue;
- } else {
- nextSection.toclevel = parseInt(node.tagName.charAt(1), 10) - 1;
- nextSection.line = node.innerHTML.trim();
- nextSection.anchor = node.id;
- node = node.nextSibling;
- break;
- }
- }
- return { sectionDiv, nextNode: node, nextSection };
-}
-
-module.exports = parse;
diff --git a/lib/parsoid-access.js b/lib/parsoid-access.js
index dda2ea7..81ca2ef 100644
--- a/lib/parsoid-access.js
+++ b/lib/parsoid-access.js
@@ -5,9 +5,10 @@
'use strict';
const domino = require('domino');
+const parsoidDomUtils = require('parsoid-dom-utils');
+
const sUtil = require('./util');
const api = require('./api-util');
-const parseSection = require('./parseSection');
const parseProperty = require('./parseProperty');
const parseDefinition = require('./parseDefinition');
const transforms = require('./transforms');
@@ -45,39 +46,26 @@
* @param {!document} doc the parsed DOM Document of the Parsoid output
*/
function addSectionDivs(doc) {
- // TODO: update once Parsoid emits section tags, see
https://phabricator.wikimedia.org/T114072#1711063
- let i = 0;
- let output;
- let sectionDiv;
- const node = doc.body.firstChild;
+ parsoidDomUtils.sections.wrap(doc);
+}
- sectionDiv = doc.createElement('div');
- sectionDiv.id = `section_${i}`;
- sectionDiv.className = 'toclevel_1';
- output = parseSection(sectionDiv, node);
- i++;
+function parseSection(startingNode) {
+ let sectionText = '';
+ let node = startingNode;
- if (output.nextNode) {
- doc.body.insertBefore(output.sectionDiv, output.nextNode);
- } else {
- doc.body.appendChild(output.sectionDiv);
- }
-
- while (output.nextNode) {
- const section = output.nextSection;
- sectionDiv = doc.createElement('div');
- sectionDiv.id = `section_${i}`;
- sectionDiv.className = `toclevel_${section.toclevel}`;
- sectionDiv.title = section.line;
- sectionDiv.setAttribute('data-anchor', section.anchor);
- output = parseSection(sectionDiv, output.nextNode);
- if (output.nextNode) {
- doc.body.insertBefore(output.sectionDiv, output.nextNode);
+ while (node) {
+ if (node.tagName !== 'SECTION') {
+ if (node.outerHTML) {
+ sectionText += node.outerHTML;
+ } else if (node.nodeType === 3) {
+ sectionText += node.textContent;
+ }
+ node = node.nextSibling;
} else {
- doc.body.appendChild(output.sectionDiv);
+ return sectionText;
}
- i++;
}
+ return sectionText;
}
/**
@@ -85,21 +73,26 @@
* @return {!sections[]} an array of section JSON elements
*/
function getSectionsText(doc) {
- // TODO: update once Parsoid emits section tags, see
https://phabricator.wikimedia.org/T114072#1711063
const sections = [];
- const sectionDivs = doc.querySelectorAll('div[id^=section]');
+ const sectionElements = doc.querySelectorAll('section');
- for (let i = 0; i < sectionDivs.length; i++) {
+ const currentSectionElement = sectionElements[0];
+ const currentSection = {};
+ currentSection.id = 0;
+ currentSection.text = currentSectionElement.innerHTML;
+ sections.push(currentSection);
+
+ for (let i = 1; i < sectionElements.length; i++) {
const currentSection = {};
- const currentSectionDiv = sectionDivs[i];
+ const currentSectionElement = sectionElements[i];
currentSection.id = i;
- currentSection.text = currentSectionDiv.innerHTML;
+ const childEl = currentSectionElement.firstChild;
- if (i !== 0) {
- const className = currentSectionDiv.className;
- currentSection.toclevel =
parseInt(className.substring('toclevel_'.length), 10);
- currentSection.line = currentSectionDiv.title;
- currentSection.anchor =
currentSectionDiv.getAttribute('data-anchor');
+ if (childEl && /^H[1-6]$/.test(childEl.tagName)) {
+ currentSection.text = parseSection(childEl.nextSibling);
+ currentSection.toclevel = parseInt(childEl.tagName.charAt(1), 10)
- 1;
+ currentSection.line = childEl.innerHTML.trim();
+ currentSection.anchor = childEl.getAttribute('id');
}
sections.push(currentSection);
@@ -188,6 +181,28 @@
});
}
+/**
+ * @param {!Object} app the application object
+ * @param {!Object} req the request object
+ * @param {?Boolean} [legacy] if enabled will apply additional transformations
+ * including a legacy version of relocation of first paragraph
+ * and hiding IPA via an inline style rather than class.
+ * @return {!promise} Returns a promise to retrieve the page content from
Parsoid
+ */
+function pageHtmlPromise(app, req, legacy) {
+ return getParsoidHtml(app, req)
+ .then((response) => {
+ const meta = { etag: response.headers && response.headers.etag };
+ const doc = domino.createDocument(response.body);
+
+ transforms.stripUnneededMarkup(doc, legacy);
+ addSectionDivs(doc);
+
+ const html = doc.outerHTML;
+ return { meta, html };
+ });
+}
+
/*
* @param {!Object} app the application object
* @param {!Object} req the request object
@@ -229,6 +244,7 @@
module.exports = {
pageContentPromise,
+ pageHtmlPromise,
definitionPromise,
getParsoidHtml,
getRevisionFromEtag,
diff --git a/lib/transforms.js b/lib/transforms.js
index 823d98e..e05d5c4 100644
--- a/lib/transforms.js
+++ b/lib/transforms.js
@@ -221,9 +221,8 @@
'.geo-nondefault',
'.geo-multi-punct',
'.hide-when-compact',
- 'div.infobox',
- 'div.magnify',
- 'span#coordinates'
+ 'div.infobox:not(#section_SpokenWikipedia)', // Remove div.infobox
with exception
+ 'div.magnify'
// Would also like to use this but does not have any effect;
// https://github.com/fgnass/domino/issues/59
diff --git a/package.json b/package.json
index 6873502..163a5d8 100644
--- a/package.json
+++ b/package.json
@@ -50,6 +50,7 @@
"express": "^4.14.0",
"js-yaml": "^3.7.0",
"mediawiki-title": "^0.5.6",
+ "parsoid-dom-utils": "^0.1.3",
"preq": "^0.5.1",
"service-runner": "^2.2.5",
"swagger-router": "^0.5.5",
diff --git a/routes/read-html.js b/routes/read-html.js
new file mode 100644
index 0000000..22e74a8
--- /dev/null
+++ b/routes/read-html.js
@@ -0,0 +1,38 @@
+'use strict';
+
+const parsoid = require('../lib/parsoid-access');
+const sUtil = require('../lib/util');
+
+/**
+ * The main router object
+ */
+const router = sUtil.router();
+
+/**
+ * The main application object reported when this module is require()d
+ */
+let app;
+
+/**
+ * GET {domain}/v1/page/read-html/{title}/{revision?}/{tid?}
+ * Gets page content in HTML. This is based on Parsoid with some minor
modifications more
+ * suitable for the reading use cases.
+ */
+router.get('/read-html/:title/:revision?/:tid?', (req, res) => {
+ return parsoid.pageHtmlPromise(app, req, false)
+ .then((response) => {
+ res.status(200);
+ res.type('html');
+ res.set('etag', response.meta.etag);
+ res.send(response.html).end();
+ });
+});
+
+module.exports = function(appObj) {
+ app = appObj;
+ return {
+ path: '/page',
+ api_version: 1,
+ router
+ };
+};
diff --git a/test/lib/parseProperty/parseInfobox.js
b/test/lib/parseProperty/parse-infobox-test.js
similarity index 100%
rename from test/lib/parseProperty/parseInfobox.js
rename to test/lib/parseProperty/parse-infobox-test.js
diff --git a/test/lib/parseProperty/parse-spoken-wikipedia-test.js
b/test/lib/parseProperty/parse-spoken-wikipedia-test.js
new file mode 100644
index 0000000..cffdb23
--- /dev/null
+++ b/test/lib/parseProperty/parse-spoken-wikipedia-test.js
@@ -0,0 +1,18 @@
+'use strict';
+
+const fs = require('fs');
+const domino = require('domino');
+const assert = require('../../utils/assert.js');
+const parseProp = require('../../../lib/parseProperty');
+const html = fs.readFileSync(`${__dirname}/../bill-clinton.html`, 'utf-8');
+
+describe('lib:parseSpokenWikipedia', () => {
+ it('Parsed spoken Wikipedia should have correct number of
page.spoken.files', () => {
+ const doc = domino.createDocument(html);
+ const page = {};
+ parseProp.parseSpokenWikipedia(doc, page);
+ assert.deepEqual(page.spoken.files.length, 1);
+ assert.deepEqual(page.spoken.files[0], 'File:Bill Clinton (spoken
article).ogg');
+ });
+});
+
--
To view, visit https://gerrit.wikimedia.org/r/352963
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I4d408349f40f9289e63a6f6f3b955cf403f48fc3
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: BearND <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits