jenkins-bot has submitted this change and it was merged.

Change subject: Fetch article contributors
......................................................................


Fetch article contributors

Creates an `authors.db` database using the `prop=contributors`
API in order to record article authorship information.

Co-authored-by: Max Semenik
Co-authored-by: C. Scott Ananian
Change-Id: Idee378f475c412df3be4d1224b8ceb9ea5531f8e
---
A lib/authors.js
M lib/index.js
2 files changed, 86 insertions(+), 2 deletions(-)

Approvals:
  Mwalker: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/authors.js b/lib/authors.js
new file mode 100644
index 0000000..26a3a68
--- /dev/null
+++ b/lib/authors.js
@@ -0,0 +1,71 @@
+// Obtain authorship information for wiki articles.
+"use strict";
+var guard = require('when/guard');
+var Api = require('./api');
+
+// limit the # of concurrent contributors (authorship) requests
+var AUTHORS_REQUEST_LIMIT = 5;
+
+var Authors = module.exports = function(wikis) {
+       this.wikis = wikis;
+       this.api = new Api(wikis);
+};
+
+/**
+ * Obtain the contributors list for a single article on a wiki.
+ */
+Authors.prototype.fetchMetadata = guard(
+       guard.n(AUTHORS_REQUEST_LIMIT),
+       function(wiki, title, status /* optional */)
+{
+       if (status) {
+               status.report(null, title + ' [metadata]');
+       }
+
+       return this.contributorsQuery(wiki, title, []).then(function(responses) 
{
+               var i, j, resp, pageid, numAnons = null, contributors = [];
+
+               for (i = 0; i < responses.length; i++) {
+                       resp = responses[i].query.pages;
+                       pageid = Object.keys(resp)[0];
+                       resp = resp[pageid];
+
+                       if (resp.anoncontributors) {
+                               numAnons = resp.anoncontributors;
+                       }
+
+                       for (j = 0; j < resp.contributors.length; j++) {
+                               contributors.push(resp.contributors[j].name);
+                       }
+               }
+
+               if (numAnons) {
+                       contributors.push('ANONIPEDITS:' + numAnons);
+               }
+               return contributors;
+       });
+});
+
+Authors.prototype.contributorsQuery = function(wiki, title, responses, 
prevResp) {
+       // XXX can we use revision id instead of title here?
+       var request = {
+               action: 'query',
+               prop: 'contributors',
+               titles: title,
+               continue: '',
+               pclimit: 5 //5000
+       };
+
+       if (prevResp) {
+               request.continue = prevResp.continue.continue;
+               request.pccontinue = prevResp.continue.pccontinue;
+       }
+
+       return this.api.request(wiki, request).then(function(resp) {
+               responses.push(resp);
+               if (resp.continue) {
+                       return this.contributorsQuery(wiki, title, responses, 
resp);
+               }
+               return responses;
+       }.bind(this));
+};
diff --git a/lib/index.js b/lib/index.js
index 5df431f..c62cd08 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -9,6 +9,7 @@
 var util = require('util');
 var when = require('when');
 
+var Authors = require('./authors');
 var Db = require('./db');
 var Html = require('./html');
 var Image = require('./image');
@@ -45,6 +46,7 @@
 
 
        var parsoid = new Parsoid(metabook.wikis);
+       var authors = new Authors(metabook.wikis);
        var html = new Html(metabook.wikis);
        var imageloader = new Image(metabook.wikis);
 
@@ -83,12 +85,13 @@
        // filled with all the parsoid sources.
        var fetchParsed = function() {
                status.createStage(
-                       // once for Parsoid, once for PHP parser, once for 
completion.
-                       3 * countItems(metabook),
+                       // 4 Tasks per item, fetch parsoid, fetch php, fetch 
metadata, mark complete
+                       4 * countItems(metabook),
                        'Fetching parsed articles'
                );
 
                var parsoidDb = new Db(path.join(options.output, "parsoid.db"));
+               var authorsDb = new Db(path.join(options.output, "authors.db"));
                var htmlDb = options.compat ?
                        new Db(path.join(options.output, "html.db")) : null;
                var max_redirects = options.follow ? MAX_REDIRECTS : 0;
@@ -119,6 +122,14 @@
                        }).then(function(result) {
                                return options.compat ? htmlDb.put(key, result) 
: null;
                        }).then(function() {
+                               // TODO: these queries should probably be 
batched
+                               return authors.fetchMetadata(item.wiki, 
item.title, status).then(function(result) {
+                                       authorsDb.put(
+                                               item.wiki ? (item.wiki + '|' + 
item.title) : item.title,
+                                               JSON.stringify(result)
+                                       );
+                               });
+                       }).then(function() {
                                status.report(null, util.format(
                                        '%s:%s [complete]',
                                        metabook.wikis[item.wiki].prefix, 
item.title
@@ -140,6 +151,8 @@
                return when.all(tasks).then(function() {
                        return parsoidDb.close();
                }).then(function() {
+                       return authorsDb.close();
+               }).then(function() {
                        return options.compat ? htmlDb.close() : null;
                });
        };

-- 
To view, visit https://gerrit.wikimedia.org/r/108033
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Idee378f475c412df3be4d1224b8ceb9ea5531f8e
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/extensions/Collection/OfflineContentGenerator/bundler
Gerrit-Branch: master
Gerrit-Owner: Mwalker <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: MaxSem <[email protected]>
Gerrit-Reviewer: Mwalker <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to