BearND has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/352963 )

Change subject: Add read-html route
......................................................................

Add read-html route

Now using parsoid-dom-utils to add <section> elements and split up the
DOM based on <section> elements. Note that <section> elements can be
nested, but we only compile section text for the current section until
we hit the next sub-section.

Skip removing div.infobox since we'll later need that for parsing
spoken_Wikipedia.
Skip removing span#coordinates since it includes span.geo, which
we'll later want to parse.

Added test for parseSpokenWikipedia.
Renamed test for parseInfobox.js to parse-infobox-test.js.

Change-Id: I4d408349f40f9289e63a6f6f3b955cf403f48fc3
---
D lib/parseSection.js
M lib/parsoid-access.js
M lib/transforms.js
M package.json
A routes/read-html.js
R test/lib/parseProperty/parse-infobox-test.js
A test/lib/parseProperty/parse-spoken-wikipedia-test.js
7 files changed, 114 insertions(+), 67 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps 
refs/changes/63/352963/1

diff --git a/lib/parseSection.js b/lib/parseSection.js
deleted file mode 100644
index 16934de..0000000
--- a/lib/parseSection.js
+++ /dev/null
@@ -1,25 +0,0 @@
-'use strict';
-
-function parse(sectionDiv, startingNode) {
-    let nextNode;
-    const nextSection = {};
-    let node = startingNode;
-
-    while (node) {
-        if (!(/^H[2-6]$/.test(node.tagName))) {
-            nextNode = node.nextSibling;
-            sectionDiv.appendChild(node);
-            node = nextNode;
-            continue;
-        } else {
-            nextSection.toclevel = parseInt(node.tagName.charAt(1), 10) - 1;
-            nextSection.line = node.innerHTML.trim();
-            nextSection.anchor = node.id;
-            node = node.nextSibling;
-            break;
-        }
-    }
-    return { sectionDiv, nextNode: node, nextSection };
-}
-
-module.exports = parse;
diff --git a/lib/parsoid-access.js b/lib/parsoid-access.js
index dda2ea7..81ca2ef 100644
--- a/lib/parsoid-access.js
+++ b/lib/parsoid-access.js
@@ -5,9 +5,10 @@
 'use strict';
 
 const domino = require('domino');
+const parsoidDomUtils = require('parsoid-dom-utils');
+
 const sUtil = require('./util');
 const api = require('./api-util');
-const parseSection = require('./parseSection');
 const parseProperty = require('./parseProperty');
 const parseDefinition = require('./parseDefinition');
 const transforms = require('./transforms');
@@ -45,39 +46,26 @@
  * @param {!document} doc the parsed DOM Document of the Parsoid output
  */
 function addSectionDivs(doc) {
-    // TODO: update once Parsoid emits section tags, see 
https://phabricator.wikimedia.org/T114072#1711063
-    let i = 0;
-    let output;
-    let sectionDiv;
-    const node = doc.body.firstChild;
+    parsoidDomUtils.sections.wrap(doc);
+}
 
-    sectionDiv = doc.createElement('div');
-    sectionDiv.id = `section_${i}`;
-    sectionDiv.className = 'toclevel_1';
-    output = parseSection(sectionDiv, node);
-    i++;
+function parseSection(startingNode) {
+    let sectionText = '';
+    let node = startingNode;
 
-    if (output.nextNode) {
-        doc.body.insertBefore(output.sectionDiv, output.nextNode);
-    } else {
-        doc.body.appendChild(output.sectionDiv);
-    }
-
-    while (output.nextNode) {
-        const section = output.nextSection;
-        sectionDiv = doc.createElement('div');
-        sectionDiv.id = `section_${i}`;
-        sectionDiv.className = `toclevel_${section.toclevel}`;
-        sectionDiv.title = section.line;
-        sectionDiv.setAttribute('data-anchor', section.anchor);
-        output = parseSection(sectionDiv, output.nextNode);
-        if (output.nextNode) {
-            doc.body.insertBefore(output.sectionDiv, output.nextNode);
+    while (node) {
+        if (node.tagName !== 'SECTION') {
+            if (node.outerHTML) {
+                sectionText += node.outerHTML;
+            } else if (node.nodeType === 3) {
+                sectionText += node.textContent;
+            }
+            node = node.nextSibling;
         } else {
-            doc.body.appendChild(output.sectionDiv);
+            return sectionText;
         }
-        i++;
     }
+    return sectionText;
 }
 
 /**
@@ -85,21 +73,26 @@
  * @return {!sections[]} an array of section JSON elements
  */
 function getSectionsText(doc) {
-    // TODO: update once Parsoid emits section tags, see 
https://phabricator.wikimedia.org/T114072#1711063
     const sections = [];
-    const sectionDivs = doc.querySelectorAll('div[id^=section]');
+    const sectionElements = doc.querySelectorAll('section');
 
-    for (let i = 0; i < sectionDivs.length; i++) {
+    const currentSectionElement = sectionElements[0];
+    const currentSection = {};
+    currentSection.id = 0;
+    currentSection.text = currentSectionElement.innerHTML;
+    sections.push(currentSection);
+
+    for (let i = 1; i < sectionElements.length; i++) {
         const currentSection = {};
-        const currentSectionDiv = sectionDivs[i];
+        const currentSectionElement = sectionElements[i];
         currentSection.id = i;
-        currentSection.text = currentSectionDiv.innerHTML;
+        const childEl = currentSectionElement.firstChild;
 
-        if (i !== 0) {
-            const className = currentSectionDiv.className;
-            currentSection.toclevel = 
parseInt(className.substring('toclevel_'.length), 10);
-            currentSection.line = currentSectionDiv.title;
-            currentSection.anchor = 
currentSectionDiv.getAttribute('data-anchor');
+        if (childEl && /^H[1-6]$/.test(childEl.tagName)) {
+            currentSection.text = parseSection(childEl.nextSibling);
+            currentSection.toclevel = parseInt(childEl.tagName.charAt(1), 10) 
- 1;
+            currentSection.line = childEl.innerHTML.trim();
+            currentSection.anchor = childEl.getAttribute('id');
         }
 
         sections.push(currentSection);
@@ -188,6 +181,28 @@
         });
 }
 
+/**
+ * @param {!Object} app the application object
+ * @param {!Object} req the request object
+ * @param {?Boolean} [legacy] if enabled will apply additional transformations
+ * including a legacy version of relocation of first paragraph
+ * and hiding IPA via an inline style rather than class.
+ * @return {!promise} Returns a promise to retrieve the page content from 
Parsoid
+ */
+function pageHtmlPromise(app, req, legacy) {
+    return getParsoidHtml(app, req)
+        .then((response) => {
+            const meta = { etag: response.headers && response.headers.etag };
+            const doc = domino.createDocument(response.body);
+
+            transforms.stripUnneededMarkup(doc, legacy);
+            addSectionDivs(doc);
+
+            const html = doc.outerHTML;
+            return { meta, html };
+        });
+}
+
 /*
  * @param {!Object} app the application object
  * @param {!Object} req the request object
@@ -229,6 +244,7 @@
 
 module.exports = {
     pageContentPromise,
+    pageHtmlPromise,
     definitionPromise,
     getParsoidHtml,
     getRevisionFromEtag,
diff --git a/lib/transforms.js b/lib/transforms.js
index 823d98e..e05d5c4 100644
--- a/lib/transforms.js
+++ b/lib/transforms.js
@@ -221,9 +221,8 @@
         '.geo-nondefault',
         '.geo-multi-punct',
         '.hide-when-compact',
-        'div.infobox',
-        'div.magnify',
-        'span#coordinates'
+        'div.infobox:not(#section_SpokenWikipedia)', // Remove div.infobox 
with exception
+        'div.magnify'
 
         // Would also like to use this but does not have any effect;
         // https://github.com/fgnass/domino/issues/59
diff --git a/package.json b/package.json
index 6873502..163a5d8 100644
--- a/package.json
+++ b/package.json
@@ -50,6 +50,7 @@
     "express": "^4.14.0",
     "js-yaml": "^3.7.0",
     "mediawiki-title": "^0.5.6",
+    "parsoid-dom-utils": "^0.1.3",
     "preq": "^0.5.1",
     "service-runner": "^2.2.5",
     "swagger-router": "^0.5.5",
diff --git a/routes/read-html.js b/routes/read-html.js
new file mode 100644
index 0000000..22e74a8
--- /dev/null
+++ b/routes/read-html.js
@@ -0,0 +1,38 @@
+'use strict';
+
+const parsoid = require('../lib/parsoid-access');
+const sUtil = require('../lib/util');
+
+/**
+ * The main router object
+ */
+const router = sUtil.router();
+
+/**
+ * The main application object reported when this module is require()d
+ */
+let app;
+
+/**
+ * GET {domain}/v1/page/read-html/{title}/{revision?}/{tid?}
+ * Gets page content in HTML. This is based on Parsoid with some minor 
modifications more
+ * suitable for the reading use cases.
+ */
+router.get('/read-html/:title/:revision?/:tid?', (req, res) => {
+    return parsoid.pageHtmlPromise(app, req, false)
+    .then((response) => {
+        res.status(200);
+        res.type('html');
+        res.set('etag', response.meta.etag);
+        res.send(response.html).end();
+    });
+});
+
+module.exports = function(appObj) {
+    app = appObj;
+    return {
+        path: '/page',
+        api_version: 1,
+        router
+    };
+};
diff --git a/test/lib/parseProperty/parseInfobox.js 
b/test/lib/parseProperty/parse-infobox-test.js
similarity index 100%
rename from test/lib/parseProperty/parseInfobox.js
rename to test/lib/parseProperty/parse-infobox-test.js
diff --git a/test/lib/parseProperty/parse-spoken-wikipedia-test.js 
b/test/lib/parseProperty/parse-spoken-wikipedia-test.js
new file mode 100644
index 0000000..cffdb23
--- /dev/null
+++ b/test/lib/parseProperty/parse-spoken-wikipedia-test.js
@@ -0,0 +1,18 @@
+'use strict';
+
+const fs = require('fs');
+const domino = require('domino');
+const assert = require('../../utils/assert.js');
+const parseProp = require('../../../lib/parseProperty');
+const html = fs.readFileSync(`${__dirname}/../bill-clinton.html`, 'utf-8');
+
+describe('lib:parseSpokenWikipedia', () => {
+    it('Parsed spoken Wikipedia should have correct number of 
page.spoken.files', () => {
+        const doc = domino.createDocument(html);
+        const page = {};
+        parseProp.parseSpokenWikipedia(doc, page);
+        assert.deepEqual(page.spoken.files.length, 1);
+        assert.deepEqual(page.spoken.files[0], 'File:Bill Clinton (spoken 
article).ogg');
+    });
+});
+

-- 
To view, visit https://gerrit.wikimedia.org/r/352963
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I4d408349f40f9289e63a6f6f3b955cf403f48fc3
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: BearND <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to