[MediaWiki-commits] [Gerrit] Add extract and thumbnail to lead response - change (mediawiki...mobileapps)

2015-09-03 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged.

Change subject: Add extract and thumbnail to lead response
..


Add extract and thumbnail to lead response

This patch adds a short extract and thumbnail to the lead response for
link previews. English extracts are supported but sentence parsing in
other locales is not tuned. Sentence fragmentation is expected to be
supplied by Parsoid or fixed in MediaWiki TextExtracts (T59669).

Bug: T108347
Change-Id: I2627b389b1ba2562bbf0aaac8b55b21d96a04743
---
A lib/extract.js
M lib/mwapi.js
M routes/mobile-html-sections.js
M spec.yaml
M test/features/mobile-html-sections-lead/pagecontent.js
5 files changed, 153 insertions(+), 2 deletions(-)

Approvals:
  BearND: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/extract.js b/lib/extract.js
new file mode 100644
index 0000000..0b98867
--- /dev/null
+++ b/lib/extract.js
@@ -0,0 +1,114 @@
+'use strict';
+/**
+ Article extracts
+ */
+
+/**
+ * @param {string} [str]
+ * @return {string} str, less parenthetical expressions and their leading whitespace, if balanced.
+ */
+function removeParens(str) {
+function count(paren) {
+return ((str || '').match(new RegExp('\\' + paren, 'g')) || []).length;
+}
+
+var openCount = count('(');
+var closeCount = count(')');
+return openCount && openCount === closeCount
+ ? removeParens(str.replace(/\s*\([^()]*\)/g, ''))
+ : str;
+}
+
+/**
+ * Find all matches of regex in text, calling callback with each match object
+ *
+ * TODO: remove when switching to Parsoid. Copied from:
+ * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/lineardoc/Utils.js
+ *
+ * @param {string} text The text to search
+ * @param {Regex} regex The regex to search; should be created for this function call
+ * @param {Function} callback Function to call with each match
+ * @return {Array} The return values from the callback
+ */
+function findAll( text, regex, callback ) {
+var match, boundary, boundaries = [];
+while ( true ) {
+match = regex.exec( text );
+if ( match === null ) {
+break;
+}
+boundary = callback( text, match );
+if ( boundary !== null ) {
+boundaries.push( boundary );
+}
+}
+return boundaries;
+}
+
+/**
+ * Test a possible English sentence boundary match
+ *
+ * TODO: remove when switching to Parsoid. Copied from:
+ * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/segmentation/languages/SegmenterDefault.js
+ *
+ * @param {string} text The plaintext to segment
+ * @param {Object} match The possible boundary match (returned by regex.exec)
+ * @return {number|null} The boundary offset, or null if not a sentence boundary
+ */
+function findBoundary( text, match ) {
+var tail, head, lastWord;
+
+tail = text.slice( match.index + 1, text.length );
+head = text.slice( 0, match.index );
+
+// Trailing non-final punctuation: not a sentence boundary
+if ( tail.match( /^[,;:]/ ) ) {
+return null;
+}
+// Next word character is number or lower-case: not a sentence boundary
+if ( tail.match( /^\W*[0-9a-z]/ ) ) {
+return null;
+}
+
+// Do not break in abbreviations. Example D. John, St. Peter
+lastWord = head.match( /(\w*)$/ )[ 0 ];
+// Exclude at most 2 letter abbreviations. Examples: T. Dr. St. Jr. Sr. Ms. Mr.
+// But not all caps like "UK." as in  "UK. Not US",
+if ( lastWord.length <= 2 && lastWord.match( /^\W*[A-Z][a-z]?$/ ) && tail.match( /^\W*[A-Z]/ ) ) {
+return null;
+}
+
+// Include any closing punctuation and trailing space
+return match.index + 1 + tail.match( /^['”"’]*\s*/ )[ 0 ].length;
+}
+
+/**
+ * Find English sentence boundaries
+ *
+ * TODO: remove when switching to Parsoid. Copied from:
+ * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/segmentation/languages/SegmenterDefault.js
+ *
+ * @param {string} text The plaintext to segment
+ * @returns {number[]} Sentence boundary offsets
+ */
+function getBoundaries( text ) {
+// Regex to find possible English sentence boundaries.
+// Must not use a shared regex instance (re.lastIndex is used)
+return findAll( text, /[.!?]/g, findBoundary );
+}
+
+function format(extract) {
+var MAX_SENTENCES = 2;
+var cleanStr = removeParens(extract.replace(/\s+/g, ' '));
+var boundaries = getBoundaries(cleanStr);
+var cleanStrEndIndex = boundaries[Math.min(boundaries.length, MAX_SENTENCES - 1)];
+
+var ret = cleanStr.slice(0, cleanStrEndIndex).trim();
+if (ret !== '…' && ret !== '..') {
+return ret;
+}
+}
+
+module.exports = {
+format: format
+};
\ No newline at end of file
diff --git a/lib/mwapi.js b/lib/mwapi.js
index 

[MediaWiki-commits] [Gerrit] Add extract and thumbnail to lead response - change (mediawiki...mobileapps)

2015-09-01 Thread Niedzielski (Code Review)
Niedzielski has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/235395

Change subject: Add extract and thumbnail to lead response
..

Add extract and thumbnail to lead response

This patch adds a short extract and thumbnail to the lead response for
link previews. English extracts are supported but sentence parsing in
other locales is not tuned. Sentence fragmentation is expected to be
supplied by Parsoid or fixed in MediaWiki TextExtracts (T59669).

Bug: T108347
Change-Id: I2627b389b1ba2562bbf0aaac8b55b21d96a04743
---
A lib/extract.js
M lib/mwapi.js
M routes/mobile-html-sections.js
M spec.yaml
M test/features/mobile-html-sections-lead/pagecontent.js
5 files changed, 177 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/95/235395/1

diff --git a/lib/extract.js b/lib/extract.js
new file mode 100644
index 0000000..9aa0a53
--- /dev/null
+++ b/lib/extract.js
@@ -0,0 +1,114 @@
+'use strict';
+/**
+ Article extracts
+ */
+
+/**
+ * @param {string} [str]
+ * @return {string} str, less parenthetical expressions and their leading whitespace, if balanced.
+ */
+function removeParens(str) {
+function count(paren) {
+return ((str || '').match(new RegExp('\\' + paren, 'g')) || []).length;
+}
+
+var openCount = count('(');
+var closeCount = count(')');
+return openCount && openCount === closeCount
+ ? removeParens(str.replace(/\s*\([^()]*\)/g, ''))
+ : str;
+}
+
+/**
+ * Find all matches of regex in text, calling callback with each match object
+ *
+ * TODO: remove when switching to Parsoid. Copied from:
+ * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/lineardoc/Utils.js
+ *
+ * @param {string} text The text to search
+ * @param {Regex} regex The regex to search; should be created for this function call
+ * @param {Function} callback Function to call with each match
+ * @return {Array} The return values from the callback
+ */
+function findAll( text, regex, callback ) {
+var match, boundary, boundaries = [];
+while ( true ) {
+match = regex.exec( text );
+if ( match === null ) {
+break;
+}
+boundary = callback( text, match );
+if ( boundary !== null ) {
+boundaries.push( boundary );
+}
+}
+return boundaries;
+}
+
+/**
+ * Test a possible English sentence boundary match
+ *
+ * TODO: remove when switching to Parsoid. Copied from:
+ * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/segmentation/languages/SegmenterDefault.js
+ *
+ * @param {string} text The plaintext to segment
+ * @param {Object} match The possible boundary match (returned by regex.exec)
+ * @return {number|null} The boundary offset, or null if not a sentence boundary
+ */
+function findBoundary( text, match ) {
+var tail, head, lastWord;
+
+tail = text.slice( match.index + 1, text.length );
+head = text.slice( 0, match.index );
+
+// Trailing non-final punctuation: not a sentence boundary
+if ( tail.match( /^[,;:]/ ) ) {
+return null;
+}
+// Next word character is number or lower-case: not a sentence boundary
+if ( tail.match( /^\W*[0-9a-z]/ ) ) {
+return null;
+}
+
+// Do not break in abbreviations. Example D. John, St. Peter
+lastWord = head.match( /(\w*)$/ )[ 0 ];
+// Exclude at most 2 letter abbreviations. Examples: T. Dr. St. Jr. Sr. Ms. Mr.
+// But not all caps like "UK." as in  "UK. Not US",
+if ( lastWord.length <= 2 && lastWord.match( /^\W*[A-Z][a-z]?$/ ) && tail.match( /^\W*[A-Z]/ ) ) {
+return null;
+}
+
+// Include any closing punctuation and trailing space
+return match.index + 1 + tail.match( /^['”"’]*\s*/ )[ 0 ].length;
+}
+
+/**
+ * Find English sentence boundaries 
+ *
+ * TODO: remove when switching to Parsoid. Copied from:
+ * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/segmentation/languages/SegmenterDefault.js
+ *
+ * @param {string} text The plaintext to segment
+ * @returns {number[]} Sentence boundary offsets
+ */
+function getBoundaries( text ) {
+// Regex to find possible English sentence boundaries.
+// Must not use a shared regex instance (re.lastIndex is used)
+return findAll( text, /[.!?]/g, findBoundary );
+}
+
+function format(extract) {
+var MAX_SENTENCES = 2;
+var cleanStr = removeParens(extract.replace(/\s+/g, ' '));
+var boundaries = getBoundaries(cleanStr);
+var cleanStrEndIndex = boundaries[Math.min(boundaries.length, MAX_SENTENCES - 1)];
+
+var ret = cleanStr.slice(0, cleanStrEndIndex).trim();
+if (ret !== '…' && ret !== '..') {
+return ret;
+}
+}
+
+module.exports = {
+format: format
+};
\ No newline at end of