Niedzielski has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/235395

Change subject: Add extract and thumbnail to lead response
......................................................................

Add extract and thumbnail to lead response

This patch adds a short extract and thumbnail to the lead response for
link previews. English extracts are supported, but sentence parsing in
other locales is not tuned. Proper sentence segmentation is expected to be
supplied by Parsoid or fixed in MediaWiki TextExtracts (T59669).

Bug: T108347
Change-Id: I2627b389b1ba2562bbf0aaac8b55b21d96a04743
---
A lib/extract.js
M lib/mwapi.js
M routes/mobile-html-sections.js
M spec.yaml
M test/features/mobile-html-sections-lead/pagecontent.js
5 files changed, 177 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/95/235395/1

diff --git a/lib/extract.js b/lib/extract.js
new file mode 100644
index 0000000..9aa0a53
--- /dev/null
+++ b/lib/extract.js
@@ -0,0 +1,114 @@
+'use strict';
+/**
+ Article extracts
+ */
+
+/**
+ * Strips parenthetical expressions from a string.
+ *
+ * Nothing is removed unless str contains at least one "(" and exactly as many
+ * "(" as ")". When that holds, every innermost "(...)" group is deleted along
+ * with the whitespace directly in front of it, and the function recurses on
+ * the result so nested groups are peeled off from the inside out.
+ *
+ * NOTE(review): only the counts are compared, not the ordering. For input such
+ * as ")(" or "a) b (c" the counts match but the replace removes nothing, so
+ * the recursive call receives the same string again and never terminates
+ * (stack overflow). Worth guarding before this sees arbitrary article text.
+ *
+ * @param {string} [str]
+ * @return {string} str, less parenthetical expressions and their leading
+ * whitespace, if balanced; otherwise str exactly as given (including
+ * undefined when no argument was passed).
+ */
+function removeParens(str) {
+    function count(paren) {
+        return ((str || '').match(new RegExp('\\' + paren, 'g')) || []).length;
+    }
+
+    var openCount = count('(');
+    var closeCount = count(')');
+    return openCount && openCount === closeCount
+         ? removeParens(str.replace(/\s*\([^()]*\)/g, ''))
+         : str;
+}
+
+/**
+ * Find all matches of regex in text, calling callback with each match object
+ *
+ * TODO: remove when switching to Parsoid. Copied from:
+ * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/lineardoc/Utils.js
+ *
+ * The callback is invoked as callback( text, match ). A return value of null
+ * means "skip this match"; every other return value is collected, in match
+ * order, into the returned array.
+ *
+ * NOTE(review): the loop depends on regex.exec() advancing regex.lastIndex,
+ * so regex must carry the "g" flag. Without it exec() keeps returning the
+ * first match and the loop never ends. For the same reason the regex instance
+ * must not be shared with other callers while this runs.
+ *
+ * @param {string} text The text to search
+ * @param {Regex} regex The regex to search; should be created for this
+ * function call
+ * @param {Function} callback Function to call with each match
+ * @return {Array} The non-null return values from the callback
+ */
+function findAll( text, regex, callback ) {
+    var match, boundary, boundaries = [];
+    while ( true ) {
+        match = regex.exec( text );
+        if ( match === null ) {
+            break;
+        }
+        boundary = callback( text, match );
+        if ( boundary !== null ) {
+            boundaries.push( boundary );
+        }
+    }
+    return boundaries;
+}
+
+/**
+ * Test a possible English sentence boundary match
+ *
+ * TODO: remove when switching to Parsoid. Copied from:
+ * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/segmentation/languages/SegmenterDefault.js
+ *
+ * The candidate is the single character at match.index; "head" is the text
+ * before it and "tail" is the text after it. The candidate is rejected (null
+ * is returned) when any of the following holds:
+ *  - the character right after it is "," ";" or ":";
+ *  - after skipping non-word characters, the tail continues with an ASCII
+ *    digit or an ASCII lower-case letter;
+ *  - the run of word characters right before it is one ASCII upper-case
+ *    letter optionally followed by one lower-case letter (e.g. "T", "Dr",
+ *    "St") and, after skipping non-word characters, the tail continues with
+ *    an ASCII upper-case letter.
+ * Otherwise the candidate is accepted and the returned offset points just
+ * past it, plus any quote characters and whitespace that directly follow.
+ *
+ * All letter tests use ASCII classes only, so the heuristics are specific to
+ * English-like text.
+ *
+ * @param {string} text The plaintext to segment
+ * @param {Object} match The possible boundary match (returned by regex.exec)
+ * @return {number|null} The boundary offset, or null if not a sentence
+ * boundary
+ */
+function findBoundary( text, match ) {
+    var tail, head, lastWord;
+
+    tail = text.slice( match.index + 1, text.length );
+    head = text.slice( 0, match.index );
+
+    // Trailing non-final punctuation: not a sentence boundary
+    if ( tail.match( /^[,;:]/ ) ) {
+        return null;
+    }
+    // Next word character is number or lower-case: not a sentence boundary
+    if ( tail.match( /^\W*[0-9a-z]/ ) ) {
+        return null;
+    }
+
+    // Do not break in abbreviations. Example D. John, St. Peter
+    lastWord = head.match( /(\w*)$/ )[ 0 ];
+    // Exclude at most 2 letter abbreviations. Examples: T. Dr. St. Jr. Sr. 
Ms. Mr.
+    // But not all caps like "UK." as in  "UK. Not US",
+    if ( lastWord.length <= 2 && lastWord.match( /^\W*[A-Z][a-z]?$/ ) && 
tail.match( /^\W*[A-Z]/ ) ) {
+        return null;
+    }
+
+    // Include any closing punctuation and trailing space
+    return match.index + 1 + tail.match( /^['”"’]*\s*/ )[ 0 ].length;
+}
+
+/**
+ * Find English sentence boundaries
+ *
+ * TODO: remove when switching to Parsoid. Copied from:
+ * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/segmentation/languages/SegmenterDefault.js
+ *
+ * Every ".", "!" and "?" in text is treated as a candidate and handed to
+ * findBoundary; candidates it rejects are dropped. Each returned offset is
+ * the index just past a sentence end (including trailing quotes and
+ * whitespace), in left-to-right order, so text.slice( 0, offset ) yields
+ * whole sentences.
+ *
+ * @param {string} text The plaintext to segment
+ * @returns {number[]} Sentence boundary offsets
+ */
+function getBoundaries( text ) {
+    // Regex to find possible English sentence boundaries.
+    // Must not use a shared regex instance (re.lastIndex is used)
+    return findAll( text, /[.!?]/g, findBoundary );
+}
+
+/**
+ * Reduces a plain-text extract to a short lead snippet.
+ *
+ * Steps: collapse every whitespace run to a single space, strip balanced
+ * parentheticals via removeParens, locate sentence boundaries via
+ * getBoundaries, cut the text at the chosen boundary and trim the result.
+ *
+ * NOTE(review): the boundary index is Math.min(boundaries.length, 1). With
+ * two or more boundaries this selects boundaries[1] (end of the second
+ * sentence). With zero or one boundary it indexes past the end of the array,
+ * yielding undefined, so slice() returns the whole cleaned text - even when
+ * one sentence end was found and unterminated text follows it. The intent
+ * was probably Math.min(boundaries.length, MAX_SENTENCES) - 1; confirm.
+ *
+ * NOTE(review): extract.replace is called unconditionally, so a non-string
+ * extract (e.g. undefined) throws a TypeError.
+ *
+ * @param {string} extract plain-text extract to shorten
+ * @return {string|undefined} the shortened extract, or undefined when the
+ * result is exactly "…" or ".."
+ */
+function format(extract) {
+    var MAX_SENTENCES = 2;
+    var cleanStr = removeParens(extract.replace(/\s+/g, ' '));
+    var boundaries = getBoundaries(cleanStr);
+    var cleanStrEndIndex = boundaries[Math.min(boundaries.length, 
MAX_SENTENCES - 1)];
+
+    var ret = cleanStr.slice(0, cleanStrEndIndex).trim();
+    if (ret !== '…' && ret !== '..') {
+        return ret;
+    }
+}
+
+// Only format() is exported; the other functions in this file stay private
+// to the module.
+module.exports = {
+    format: format
+};
\ No newline at end of file
diff --git a/lib/mwapi.js b/lib/mwapi.js
index c9c5495..86fb693 100644
--- a/lib/mwapi.js
+++ b/lib/mwapi.js
@@ -61,6 +61,29 @@
 }
 
 /**
+ *
+ * Requests extract, thumbnail and description information for a page by
+ * issuing an action=query call with prop=extracts|pageimages|pageterms.
+ * Extracts are requested as plain text limited by exchars=512; the sentence
+ * limit (exsentences) is deliberately left off, see T59669.
+ *
+ * @param {String} domain the domain to contact
+ * @param {String} title the page title, sent as the "titles" parameter
+ * @param {Number} [width] thumbnail size sent as "pithumbsize"; any falsy
+ *        value (undefined, 0, NaN) falls back to 320
+ * @return {Promise} whatever apiGet returns for this query; the request asks
+ *         for format "json", so presumably a promise for the JSON API
+ *         response rather than HTML - confirm against apiGet
+ */
+function requestSummary(domain, title, width) {
+    return apiGet(domain, {
+            action: 'query',
+            //exsentences: 2, -- see T59669
+            exchars: 512,
+            explaintext: true,
+            format: 'json',
+            piprop: 'thumbnail|name',
+            pithumbsize: width ? width : 320,
+            prop: "extracts|pageimages|pageterms",
+            redirects: true,
+            titles: title,
+            wbptterms: 'description'
+        });
+}
+
+/**
  * Builds an array of URLs for lead images with different sizes based on 
common bucket widths: 640, 800, 1024.
  * @param initialUrl the initial URL for an actual lead image (caller already 
checked for undefined)
  *        example URL: 
//upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Cat_poster_1.jpg/640px-Cat_poster_1.jpg
@@ -90,5 +113,6 @@
     apiGet: apiGet,
     getAllSections: getAllSections,
     buildLeadImageUrls: buildLeadImageUrls,
-    checkForQueryPagesInResponse: checkForQueryPagesInResponse
+    checkForQueryPagesInResponse: checkForQueryPagesInResponse,
+    requestSummary: requestSummary
 };
diff --git a/routes/mobile-html-sections.js b/routes/mobile-html-sections.js
index 73ada98..61ac4f5 100644
--- a/routes/mobile-html-sections.js
+++ b/routes/mobile-html-sections.js
@@ -19,6 +19,7 @@
 var mwapi = require('../lib/mwapi');
 var gallery = require('../lib/gallery');
 var domino = require('domino');
+var extract = require('../lib/extract');
 
 // shortcut
 var HTTPError = sUtil.HTTPError;
@@ -82,8 +83,22 @@
     return out;
 }
 
+/**
+ * Picks the extract and thumbnail out of a summary query response body.
+ *
+ * Only the first key of body.query.pages is inspected. The extract is passed
+ * through extract.format; the thumbnail's source/width/height are re-exposed
+ * as url/width/height.
+ *
+ * NOTE(review): throws a TypeError when body.query or body.query.pages is
+ * missing. extract.format calls .replace on its argument unconditionally, so
+ * a page object without an "extract" property also throws - confirm whether
+ * the API can omit it (e.g. for missing pages).
+ *
+ * @param {Object} body response body containing query.pages
+ * @return {Object} { extract, thumbnail }; each member is falsy when the
+ *         corresponding data is absent
+ */
+function parseSummary(body) {
+    var id = Object.keys(body.query.pages)[0];
+    var page = body.query.pages[id];
+    return {
+        extract: page && extract.format(page.extract),
+        thumbnail: page && page.thumbnail && {
+            url: page.thumbnail.source,
+            width: page.thumbnail.width,
+            height: page.thumbnail.height
+        }
+    };
+}
+
 function buildLead(input) {
     var lead = domino.createDocument(input.page.sections[0].text);
+    var summary = input.summary && parseSummary(input.summary.body) || {};
     return {
         id: input.page.id,
         revision: input.page.revision,
@@ -96,6 +111,9 @@
             file: input.page.image && input.page.image.file,
             urls: input.page.thumb && 
mwapi.buildLeadImageUrls(input.page.thumb.url)
         })),
+        extract: summary.extract,
+        // TODO: should we just add a lower resolution to the lead images 
array?
+        thumbnail: summary.thumbnail,
         media: input.media,
         infobox: transforms.parseInfobox(lead),
         pronunciation: transforms.parsePronunciation(lead),
@@ -137,7 +155,10 @@
 router.get('/mobile-html-sections-lead/:title', function (req, res) {
     return BBPromise.props({
         page: pageContentPromise(req.params.domain, req.params.title),
-        media: gallery.collectionPromise(req.logger, req.params.domain, 
req.params.title)
+        media: gallery.collectionPromise(req.logger, req.params.domain, 
req.params.title),
+        summary: mwapi.requestSummary(req.params.domain,
+                                      req.params.title,
+                                      req.query.width && 
parseInt(req.query.width))
     }).then(function (response) {
         res.status(200).json(buildLead(response)).end();
     });
diff --git a/spec.yaml b/spec.yaml
index 82c796c..0e0228f 100644
--- a/spec.yaml
+++ b/spec.yaml
@@ -116,6 +116,7 @@
           request:
             params:
               title: Main_Page
+              width: /\d+/
           response:
             status: 200
             headers:
@@ -147,6 +148,11 @@
               protection: /.+/
               editable: /.+/
               languagecount: /.+/
+              extract: /.+/
+              thumbnail:
+                url: /.+/
+                width: /\d+/
+                height: /\d+/
               media:
                 items: [ /.+/ ]
               infobox: [ [ /.+/ ] ]
diff --git a/test/features/mobile-html-sections-lead/pagecontent.js 
b/test/features/mobile-html-sections-lead/pagecontent.js
index fd6485a..464c647 100644
--- a/test/features/mobile-html-sections-lead/pagecontent.js
+++ b/test/features/mobile-html-sections-lead/pagecontent.js
@@ -71,6 +71,16 @@
                 assert.deepEqual(lead.geo.longitude, -122.417);
             });
     });
+    // Requests the San_Francisco lead with ?width=100 and checks that the
+    // response carries a non-empty extract and a thumbnail whose width is
+    // exactly the requested 100, with a non-empty url and a positive height.
+    // NOTE(review): presumably depends on live en.wikipedia.org content and
+    // on the thumbnailer honouring the requested width - confirm.
+    it('en San Francisco should have a lead object with an extract and 
thumbnail', function() {
+        return preq.get({ uri: server.config.uri + 
'en.wikipedia.org/v1/page/mobile-html-sections-lead/San_Francisco?width=100' })
+            .then(function(res) {
+                var lead = res.body;
+                assert.ok(lead.extract.length > 0);
+                assert.ok(lead.thumbnail.url.length > 0);
+                assert.deepEqual(lead.thumbnail.width, 100);
+                assert.ok(lead.thumbnail.height > 0);
+            });
+    });
     it('Obama (redirect) should have a lead image', function() {
         return preq.get({ uri: server.config.uri + 
'en.wikipedia.org/v1/page/mobile-html-sections-lead/Obama' })
             .then(function(res) {

-- 
To view, visit https://gerrit.wikimedia.org/r/235395
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2627b389b1ba2562bbf0aaac8b55b21d96a04743
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: Niedzielski <sniedziel...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to