jenkins-bot has submitted this change and it was merged.
Change subject: Fetch article contributors
......................................................................
Fetch article contributors
Creates an `authors.db` database using the `prop=contributors`
API in order to record article authorship information.
Co-authored-by: Max Semenik
Co-authored-by: C. Scott Ananian
Change-Id: Idee378f475c412df3be4d1224b8ceb9ea5531f8e
---
A lib/authors.js
M lib/index.js
2 files changed, 86 insertions(+), 2 deletions(-)
Approvals:
Mwalker: Looks good to me, approved
jenkins-bot: Verified
diff --git a/lib/authors.js b/lib/authors.js
new file mode 100644
index 0000000..26a3a68
--- /dev/null
+++ b/lib/authors.js
@@ -0,0 +1,71 @@
+// Obtain authorship information for wiki articles.
+"use strict";
+var guard = require('when/guard');
+var Api = require('./api');
+
+// limit the # of concurrent image requests
+var AUTHORS_REQUEST_LIMIT = 5;
+
+var Authors = module.exports = function(wikis) {
+ this.wikis = wikis;
+ this.api = new Api(wikis);
+};
+
+/**
+ * Obtain the contributors list for a single article on a wiki.
+ */
+Authors.prototype.fetchMetadata = guard(
+ guard.n(AUTHORS_REQUEST_LIMIT),
+ function(wiki, title, status /* optional */)
+{
+ if (status) {
+ status.report(null, title + ' [metadata]');
+ }
+
+ return this.contributorsQuery(wiki, title, []).then(function(responses)
{
+ var i, j, resp, pageid, numAnons = null, contributors = [];
+
+ for (i = 0; i < responses.length; i++) {
+ resp = responses[i].query.pages;
+ pageid = Object.keys(resp)[0];
+ resp = resp[pageid];
+
+ if (resp.anoncontributors) {
+ numAnons = resp.anoncontributors;
+ }
+
+ for (j = 0; j < resp.contributors.length; j++) {
+ contributors.push(resp.contributors[j].name);
+ }
+ }
+
+ if (numAnons) {
+ contributors.push('ANONIPEDITS:' + numAnons);
+ }
+ return contributors;
+ });
+});
+
+Authors.prototype.contributorsQuery = function(wiki, title, responses,
prevResp) {
+ // XXX can we use revision id instead of title here?
+ var request = {
+ action: 'query',
+ prop: 'contributors',
+ titles: title,
+ continue: '',
+ pclimit: 5 //5000
+ };
+
+ if (prevResp) {
+ request.continue = prevResp.continue.continue;
+ request.pccontinue = prevResp.continue.pccontinue;
+ }
+
+ return this.api.request(wiki, request).then(function(resp) {
+ responses.push(resp);
+ if (resp.continue) {
+ return this.contributorsQuery(wiki, title, responses,
resp);
+ }
+ return responses;
+ }.bind(this));
+};
diff --git a/lib/index.js b/lib/index.js
index 5df431f..c62cd08 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -9,6 +9,7 @@
var util = require('util');
var when = require('when');
+var Authors = require('./authors');
var Db = require('./db');
var Html = require('./html');
var Image = require('./image');
@@ -45,6 +46,7 @@
var parsoid = new Parsoid(metabook.wikis);
+ var authors = new Authors(metabook.wikis);
var html = new Html(metabook.wikis);
var imageloader = new Image(metabook.wikis);
@@ -83,12 +85,13 @@
// filled with all the parsoid sources.
var fetchParsed = function() {
status.createStage(
- // once for Parsoid, once for PHP parser, once for
completion.
- 3 * countItems(metabook),
+ // 4 Tasks per item, fetch parsoid, fetch php, fetch
metadata, mark complete
+ 4 * countItems(metabook),
'Fetching parsed articles'
);
var parsoidDb = new Db(path.join(options.output, "parsoid.db"));
+ var authorsDb = new Db(path.join(options.output, "authors.db"));
var htmlDb = options.compat ?
new Db(path.join(options.output, "html.db")) : null;
var max_redirects = options.follow ? MAX_REDIRECTS : 0;
@@ -119,6 +122,14 @@
}).then(function(result) {
return options.compat ? htmlDb.put(key, result)
: null;
}).then(function() {
+ // TODO: these queries should probably be
batched
+ return authors.fetchMetadata(item.wiki,
item.title, status).then(function(result) {
+ authorsDb.put(
+ item.wiki ? (item.wiki + '|' +
item.title) : item.title,
+ JSON.stringify(result)
+ );
+ });
+ }).then(function() {
status.report(null, util.format(
'%s:%s [complete]',
metabook.wikis[item.wiki].prefix,
item.title
@@ -140,6 +151,8 @@
return when.all(tasks).then(function() {
return parsoidDb.close();
}).then(function() {
+ return authorsDb.close();
+ }).then(function() {
return options.compat ? htmlDb.close() : null;
});
};
--
To view, visit https://gerrit.wikimedia.org/r/108033
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Idee378f475c412df3be4d1224b8ceb9ea5531f8e
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/extensions/Collection/OfflineContentGenerator/bundler
Gerrit-Branch: master
Gerrit-Owner: Mwalker <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: MaxSem <[email protected]>
Gerrit-Reviewer: Mwalker <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits