Mobrovac has submitted this change and it was merged.
Change subject: Change how pubmed and pmcs are requested
......................................................................
Change how pubmed and pmcs are requested
The PubMed ID converted API was frequently
not returning results for valid PMIDs, so
instead we request metadata from pubmed
URLs directly.
CitoidService
* Remove requestFromPubMed, which used
the ID converter API to get a DOI
* Replace with requestFromPMCID and PMID
which concatenates id onto a PubMed URL,
verifies the server sends a 200 ok
response code, and sends to
requestFromPubMedURL.
* Add requestFromPubMedURL which sends
pubmed urls to Zotero first, and to
native scraper second. If the scraper
detects an HTTP error, it returns 404,
otherwise it scrapes the page.
Tests
* Convert pmid test for to a PMID which has
no results in the converter API but has a
valid URL.
* Add test for PMC with PMC prefix
* Add test for PMC without PMC prefix
* Add test for invalid PMCID
* Add test for invald PMID
Bug: T93335
Change-Id: Ie2830d42fa63fecd30db72f723e6d5d9979f51b5
---
M lib/CitoidService.js
M package.json
M test/features/errors/index.js
M test/features/scraping/index.js
4 files changed, 202 insertions(+), 49 deletions(-)
Approvals:
Mobrovac: Verified; Looks good to me, approved
diff --git a/lib/CitoidService.js b/lib/CitoidService.js
index c8aadcc..05280b5 100644
--- a/lib/CitoidService.js
+++ b/lib/CitoidService.js
@@ -21,6 +21,7 @@
* @param {Object} statsd metrics object
*/
function CitoidService(citoidConfig, logger, statsd) {
+
this.logger = logger;
this.zoteroService = new ZoteroService(citoidConfig, logger);
this.scraper = new Scraper(citoidConfig, logger);
@@ -53,11 +54,12 @@
* @param {Function} callback callback (error, statusCode, body)
*/
CitoidService.prototype.requestFromURL = function (opts, callback) {
- var self = this,
- logger = self.logger,
- zoteroWebRequest =
self.zoteroService.zoteroWebRequest.bind(self.zoteroService),
- requestedURL = opts.search,
- format = opts.format;
+
+ var self = this;
+ var logger = self.logger;
+ var zoteroWebRequest =
self.zoteroService.zoteroWebRequest.bind(self.zoteroService);
+ var requestedURL = opts.search;
+ var format = opts.format;
zoteroWebRequest(requestedURL, format, function(error, response, body) {
logger.log('debug/zotero', "Zotero request made for: " +
requestedURL);
@@ -120,6 +122,7 @@
* @param {Function} callback callback (error, statusCode, body)
*/
CitoidService.prototype.requestFromDOI = function (doiOpts, callback){
+
var doiLink = 'http://dx.doi.org/'+ doiOpts.search;
var citoidService = this;
// Shallow clone doiOpts for requestFromURL method
@@ -141,33 +144,142 @@
"; Sending to requestFromURL");
citoidService.requestFromURL(urlOpts, callback);
} else {
- citoidService.logger.log('debug/DOI', "Unable to
resolve DOI "
- + doiOpts.search);
- var message = 'Unable to resolve DOI';
+ var message = 'Unable to resolve DOI ' + doiOpts.search;
var error = new Error(message);
+ citoidService.logger.log('debug/DOI', message);
callback(error, 404, {Error: message});
}
});
};
/**
- * Request citation metadata from a PubMed identifier. Supports PMID, PMCID,
Manuscript ID and versioned identifiers
- * @param {Object} opts options object containing PubMed identifier.
PMCID identifiers must begin with 'PMC'
+ * Requests citation metadata from a PMID or PMCID identifier.
+ * @param {String} type 'PMID' or 'PMCID'
+ * @param {Object} opts options object containing a PM(C)ID
* @param {Function} callback callback (error, statusCode, body)
*/
-CitoidService.prototype.requestFromPubMedID = function(opts, callback){
- var citoidService = this;
- pubMedRequest(opts.search, this.logger, function(error, obj){
- if(error) {
- callback(error, null, null);
+CitoidService.prototype.requestFromPM = function(type, opts, callback){
+
+ var self = this;
+ var urlOpts = Object.assign({}, opts); // shallow clone of opts
+ var baseURL = 'http://www.ncbi.nlm.nih.gov/';
+
+ switch(type) {
+ case 'PMID':
+ baseURL = baseURL + 'pubmed/';
+ break;
+ case 'PMCID':
+ baseURL = baseURL + 'pmc/articles/';
+ break;
+ default:
+ var e = new Error('Unknown PubMed type: ' + type);
+ self.logger.log('warn/pubmed', e);
+ callback(e, 400, {Error: e.message});
+ return;
+ }
+
+ urlOpts.search = baseURL + opts.search;
+
+ self.logger.log('debug/pubmed', {from: opts.search, to: urlOpts.search,
+ type: type});
+
+ self.requestFromPubMedURL(urlOpts, function(err, responseCode, body){
+ if (responseCode === 200){
+ callback(err, 200, body);
} else {
- var doi = obj.records[0].doi;
- citoidService.logger.log('debug/pubmed', "Got DOI " +
doi);
- opts.search = doi;
- citoidService.requestFromDOI(opts, callback);
+ // Handle case where 404 is due to failed export
+ if (responseCode === 404 && body) {
+ callback(err, 404, body);
+ // For all other errors, override message
+ } else {
+ var e = new Error('Unable to locate resource
with ' + type + ' ' +
+ opts.search);
+ self.logger.log('info/pubmed', e.message);
+ callback(e, 404, {Error: e.message});
+ }
+ }
+ });
+
+};
+
+/**
+ * Request citation metadata from a pubmed url. Should be called by
+ * requestFromPM
+ * @param {Object} opts options object containing pubmed url
+ * @param {Function} callback callback (error, statusCode, body)
+ */
+CitoidService.prototype.requestFromPubMedURL = function(opts, callback){
+
+ var self = this;
+ var logger = self.logger;
+ var zoteroWebRequest =
self.zoteroService.zoteroWebRequest.bind(self.zoteroService);
+ var requestedURL = opts.search;
+ var format = opts.format;
+
+ function scrapeAndCheck() {
+ self.scrape(opts, function(error, responseCode, body){
+ // Return 404s instead of 520s for invalid pubmed links
+ if (responseCode === 200){
+ callback(error, 200, body);
+ } else if (responseCode === 404) {
+ // 404s here are due to a failed export and
+ // the error and body should be passed through.
+ callback(error, 404, body);
+ } else {
+ // Overwrite body of any 520 response from
scraper
+ callback(error, 404, null);
+ }
+ });
+ }
+
+ zoteroWebRequest(requestedURL, format, function(error, response, body) {
+ logger.log('debug/zotero', "Zotero request made for: " +
requestedURL);
+ if (error) {
+ logger.log('warn/zotero', error);
+ self.stats.increment('zotero.req.error');
+ scrapeAndCheck();
+ } else if (!response) {
+ callback(true, 404, null);
+ } else {
+ self.stats.increment('zotero.req.' +
Math.floor(response.statusCode / 100)
+ + 'xx');
+
+ var citationInBody = Array.isArray(response.body) &&
response.body[0];
+ var okBody = response.statusCode === 200 &&
citationInBody;
+ var notImplemented = response.statusCode === 501;
+
+ if (okBody) {
+ logger.log('debug/zotero', "Successfully
retrieved and translated\
+ body from Zotero");
+ callback (null, 200, body);
+ } else {
+
+ // 501 not implemented can indicate that the
link is invalid, or
+ // that the translator is broken. Translators
can break at any
+ // time if the website's url structure is
changed.
+ if (notImplemented){
+ logger.log('trace/zotero', "Zotero not
implemented error; \
+ PubMed ID may be invalid");
+ }
+
+ // Zotero ideally should return 501 for http
errors at the
+ // location, but the PubMed translator is
currently broken and
+ // returns 200 and empty body for http errors
for PMCs only
+ if (!citationInBody){
+ logger.log('trace/zotero', "Zotero
returned 200 but with \
+ empty body; PubMed ID may be invalid");
+ }
+
+ if (!citationInBody || notImplemented) {
+ scrapeAndCheck();
+ } else {
+ callback(true, 404, null);
+ }
+ }
}
});
};
+
/**
* Determine type of string (doi, url) and callback on correct handler
@@ -175,24 +287,21 @@
* @param {Function} callback callback(extractedValue,
correctFunction)
*/
CitoidService.prototype.distinguish = function(rawSearchInput, callback){
- var reDOI, rePMID, rePMCID, rePMCID2, reHTTP, reWWW,
- parsedURL,
- matchDOI, matchPMID, matchPMCID, matchHTTP, matchWWW,
- search = rawSearchInput.trim();
- reHTTP = new RegExp('^((https?)://.+\\..+)'); // Assumes all strings
with http/s protocol are URLs
- reWWW = new RegExp('^((www)\\..+\\..+)'); // Assumes all strings with
www substring are URLs
- reDOI = new RegExp('\\b10\\.?[0-9]{3,4}(?:[.][0-9]+)*/.*');
- rePMID = new RegExp('^\\d{8}\\b');
- rePMCID = new RegExp('\\bPMC\\d{7}\\b');
- rePMCID2 = new RegExp('^\\d{7}\\b');
+ var search = rawSearchInput.trim();
- matchHTTP = search.match(reHTTP);
- matchDOI = search.match(reDOI);
- matchPMID = search.match(rePMID);
- matchPMCID = search.match(rePMCID);
- matchWWW = search.match(reWWW);
+ var reHTTP = new RegExp('^((https?)://.+\\..+)'); // Assumes all
strings with http/s protocol are URLs
+ var reWWW = new RegExp('^((www)\\..+\\..+)'); // Assumes all strings
with www substring are URLs
+ var reDOI = new RegExp('\\b10\\.?[0-9]{3,4}(?:[.][0-9]+)*/.*');
+ var rePMID = new RegExp('^\\d{8}\\b');
+ var rePMCID = new RegExp('\\bPMC\\d{7}\\b');
+ var rePMCID2 = new RegExp('^\\d{7}\\b');
+ var matchHTTP = search.match(reHTTP);
+ var matchDOI = search.match(reDOI);
+ var matchPMID = search.match(rePMID);
+ var matchPMCID = search.match(rePMCID);
+ var matchWWW = search.match(reWWW);
if (matchHTTP || matchWWW){
this.stats.increment('input.url');
@@ -202,18 +311,18 @@
callback(matchDOI[0], this.requestFromDOI.bind(this));
} else if (matchPMID) {
this.stats.increment('input.pmid');
- callback(matchPMID[0], this.requestFromPubMedID.bind(this));
+ callback(matchPMID[0], this.requestFromPM.bind(this, 'PMID'));
} else if (matchPMCID) {
this.stats.increment('input.pmcid');
- callback(matchPMCID[0], this.requestFromPubMedID.bind(this));
+ callback(matchPMCID[0], this.requestFromPM.bind(this, 'PMCID'));
} else {
- matchPMCID = search.match(rePMCID2);
+ matchPMCID = search.match(rePMCID2); // Detects PMCIDs with no
PMC prefix
if (matchPMCID) {
this.stats.increment('input.pmcid');
- callback('PMC' + matchPMCID[0],
this.requestFromPubMedID.bind(this));
+ callback('PMC' + matchPMCID[0],
this.requestFromPM.bind(this, 'PMCID'));
} else {
this.stats.increment('input.url');
- parsedURL = urlParse.parse(search);
+ var parsedURL = urlParse.parse(search);
if (!parsedURL.protocol){
search = 'http://'+ search;
}
@@ -228,15 +337,19 @@
* @param {Function} callback callback(error, responseCode, citation)
*/
CitoidService.prototype.scrape = function(opts, callback){
- var cbResCode, // Response Code to send to the callback
- format = opts.format,
- scrape = this.scraper.scrape.bind(this.scraper),
- zoteroExportRequest =
this.zoteroService.zoteroExportRequest.bind(this.zoteroService);
+ var cbResCode; // Response Code to send to the callback
+ var self = this;
+ var format = opts.format;
+ var scrape = this.scraper.scrape.bind(this.scraper);
+ var zoteroExportRequest =
this.zoteroService.zoteroExportRequest.bind(this.zoteroService);
+
if (format === 'bibtex') {
scrape(opts, function(error, scrapeResCode, citation) {
zoteroExportRequest(citation[0], format, function(err,
zotResCode, body) {
- if (zotResCode !== 200){
- body = "Unable to serve "+ format +"
format at this time";
+ if (err || zotResCode !== 200){
+ self.logger.log('trace/zotero', "Unable
to translate to export format bibtex");
+ var message = "Unable to serve " +
format + " format at this time";
+ body = {Error: message};
cbResCode = 404; // 404 error if cannot
translate into alternate format
} else if (scrapeResCode !== 200){
cbResCode = 520; // 520 error if the
scraper couldn't scrape from url
diff --git a/package.json b/package.json
index 1707ab2..e283dd8 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "citoid",
- "version": "0.2.4",
+ "version": "0.2.5",
"description": "Converts search terms such as URL or DOI into citations.",
"scripts": {
"start": "service-runner",
diff --git a/test/features/errors/index.js b/test/features/errors/index.js
index b997ed9..b56e120 100644
--- a/test/features/errors/index.js
+++ b/test/features/errors/index.js
@@ -77,7 +77,32 @@
assert.status(res, 404);
}, function(err) {
assert.status(err, 404);
- assert.deepEqual(err.body.Error, 'Unable to resolve
DOI',
+ assert.deepEqual(err.body.Error, 'Unable to resolve DOI
' + doi,
+ 'Unexpected error message ' + err.body.Error);
+ });
+ });
+
+ it('bad pmid', function() {
+ var pmid = '99999999';
+ return server.query(pmid, 'mediawiki', 'en')
+ .then(function(res) {
+ assert.status(res, 404);
+ }, function(err) {
+ assert.status(err, 404);
+ assert.deepEqual(err.body.Error,
+ 'Unable to locate resource with PMID ' + pmid,
+ 'Unexpected error message ' + err.body.Error);
+ });
+ });
+
+ it('bad pmcid', function() {
+ var pmcid = 'PMC9999999';
+ return server.query(pmcid, 'mediawiki', 'en')
+ .then(function(res) {
+ assert.status(res, 404);
+ }, function(err) {
+ assert.status(err, 404);
+ assert.deepEqual(err.body.Error, 'Unable to locate
resource with PMCID ' + pmcid,
'Unexpected error message ' + err.body.Error);
});
});
diff --git a/test/features/scraping/index.js b/test/features/scraping/index.js
index ff2e333..5538399 100644
--- a/test/features/scraping/index.js
+++ b/test/features/scraping/index.js
@@ -12,8 +12,23 @@
before(function () { return server.start(); });
- it('pmid', function() {
- return server.query('23555203').then(function(res) {
+ //PMID on NIH website that is not found in the id converter api
+ it('pmid (not in id converter)', function() {
+ return server.query('14656957').then(function(res) {
+ assert.status(res, 200);
+ assert.checkCitation(res, 'Seventh report of the Joint
National Committee on Prevention, Detection, Evaluation, and Treatment of High
Blood Pressure');
+ });
+ });
+
+ it('pmcid with prefix', function() {
+ return server.query('PMC3605911').then(function(res) {
+ assert.status(res, 200);
+ assert.checkCitation(res, 'Viral Phylodynamics');
+ });
+ });
+
+ it('pmcid without prefix', function() {
+ return server.query('3605911').then(function(res) {
assert.status(res, 200);
assert.checkCitation(res, 'Viral Phylodynamics');
});
--
To view, visit https://gerrit.wikimedia.org/r/200314
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ie2830d42fa63fecd30db72f723e6d5d9979f51b5
Gerrit-PatchSet: 4
Gerrit-Project: mediawiki/services/citoid
Gerrit-Branch: master
Gerrit-Owner: Mvolz <[email protected]>
Gerrit-Reviewer: Mobrovac <[email protected]>
Gerrit-Reviewer: Mvolz <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits