Mobrovac has submitted this change and it was merged.

Change subject: Change how pubmed and pmcs are requested
......................................................................


Change how pubmed and pmcs are requested

The PubMed ID converted API was frequently
not returning results for valid PMIDs, so
instead we request metadata from pubmed
URLs directly.

CitoidService

* Remove requestFromPubMed, which used
the ID converter API to get a DOI
* Replace with requestFromPMCID and PMID
which concatenates id onto a PubMed URL,
verifies the server sends a 200 ok
response code, and sends to
requestFromPubMedURL.
* Add requestFromPubMedURL which sends
pubmed urls to Zotero first, and to
native scraper second. If the scraper
detects an HTTP error, it returns 404,
otherwise it scrapes the page.

Tests

* Convert pmid test for to a PMID which has
no results in the converter API but has a
valid URL.
* Add test for PMC with PMC prefix
* Add test for PMC without PMC prefix
* Add test for invalid PMCID
* Add test for invald PMID

Bug: T93335
Change-Id: Ie2830d42fa63fecd30db72f723e6d5d9979f51b5
---
M lib/CitoidService.js
M package.json
M test/features/errors/index.js
M test/features/scraping/index.js
4 files changed, 202 insertions(+), 49 deletions(-)

Approvals:
  Mobrovac: Verified; Looks good to me, approved



diff --git a/lib/CitoidService.js b/lib/CitoidService.js
index c8aadcc..05280b5 100644
--- a/lib/CitoidService.js
+++ b/lib/CitoidService.js
@@ -21,6 +21,7 @@
  * @param {Object} statsd      metrics object
  */
 function CitoidService(citoidConfig, logger, statsd) {
+
        this.logger = logger;
        this.zoteroService = new ZoteroService(citoidConfig, logger);
        this.scraper = new Scraper(citoidConfig, logger);
@@ -53,11 +54,12 @@
  * @param  {Function} callback   callback (error, statusCode, body)
  */
 CitoidService.prototype.requestFromURL = function (opts, callback) {
-       var self = this,
-               logger = self.logger,
-               zoteroWebRequest = 
self.zoteroService.zoteroWebRequest.bind(self.zoteroService),
-               requestedURL = opts.search,
-               format = opts.format;
+
+       var self = this;
+       var logger = self.logger;
+       var zoteroWebRequest = 
self.zoteroService.zoteroWebRequest.bind(self.zoteroService);
+       var requestedURL = opts.search;
+       var format = opts.format;
 
        zoteroWebRequest(requestedURL, format, function(error, response, body) {
                logger.log('debug/zotero', "Zotero request made for: " + 
requestedURL);
@@ -120,6 +122,7 @@
  * @param  {Function} callback   callback (error, statusCode, body)
  */
 CitoidService.prototype.requestFromDOI = function (doiOpts, callback){
+
        var doiLink = 'http://dx.doi.org/'+ doiOpts.search;
        var citoidService = this;
        // Shallow clone doiOpts for requestFromURL method
@@ -141,33 +144,142 @@
                                "; Sending to requestFromURL");
                        citoidService.requestFromURL(urlOpts, callback);
                } else {
-                       citoidService.logger.log('debug/DOI', "Unable to 
resolve DOI "
-                               + doiOpts.search);
-                       var message = 'Unable to resolve DOI';
+                       var message = 'Unable to resolve DOI ' + doiOpts.search;
                        var error = new Error(message);
+                       citoidService.logger.log('debug/DOI', message);
                        callback(error, 404, {Error: message});
                }
        });
 };
 
 /**
- * Request citation metadata from a PubMed identifier. Supports PMID, PMCID, 
Manuscript ID and versioned identifiers
- * @param  {Object}   opts       options object containing PubMed identifier. 
PMCID identifiers must begin with 'PMC'
+ * Requests citation metadata from a PMID or PMCID identifier.
+ * @param  {String}   type       'PMID' or 'PMCID'
+ * @param  {Object}   opts       options object containing a PM(C)ID
  * @param  {Function} callback   callback (error, statusCode, body)
  */
-CitoidService.prototype.requestFromPubMedID = function(opts, callback){
-       var citoidService = this;
-       pubMedRequest(opts.search, this.logger, function(error, obj){
-               if(error) {
-                       callback(error, null, null);
+CitoidService.prototype.requestFromPM = function(type, opts, callback){
+
+       var self = this;
+       var urlOpts =  Object.assign({}, opts); // shallow clone of opts
+       var baseURL = 'http://www.ncbi.nlm.nih.gov/';
+
+       switch(type) {
+               case 'PMID':
+                       baseURL = baseURL + 'pubmed/';
+                       break;
+               case 'PMCID':
+                       baseURL = baseURL + 'pmc/articles/';
+                       break;
+               default:
+                       var e = new Error('Unknown PubMed type: ' + type);
+                       self.logger.log('warn/pubmed', e);
+                       callback(e, 400, {Error: e.message});
+                       return;
+       }
+
+       urlOpts.search = baseURL + opts.search;
+
+       self.logger.log('debug/pubmed', {from: opts.search, to: urlOpts.search,
+               type: type});
+
+       self.requestFromPubMedURL(urlOpts, function(err, responseCode, body){
+               if (responseCode === 200){
+                       callback(err, 200, body);
                } else {
-                       var doi = obj.records[0].doi;
-                       citoidService.logger.log('debug/pubmed', "Got DOI " + 
doi);
-                       opts.search = doi;
-                       citoidService.requestFromDOI(opts, callback);
+                       // Handle case where 404 is due to failed export
+                       if (responseCode === 404 && body) {
+                               callback(err, 404, body);
+                       // For all other errors, override message
+                       } else {
+                               var e = new Error('Unable to locate resource 
with ' + type + ' ' +
+                                       opts.search);
+                               self.logger.log('info/pubmed', e.message);
+                               callback(e, 404, {Error: e.message});
+                       }
+               }
+       });
+
+};
+
+/**
+ * Request citation metadata from a pubmed url. Should be called by
+ * requestFromPM
+ * @param  {Object}   opts       options object containing pubmed url
+ * @param  {Function} callback   callback (error, statusCode, body)
+ */
+CitoidService.prototype.requestFromPubMedURL = function(opts, callback){
+
+       var self = this;
+       var logger = self.logger;
+       var zoteroWebRequest = 
self.zoteroService.zoteroWebRequest.bind(self.zoteroService);
+       var requestedURL = opts.search;
+       var format = opts.format;
+
+       function scrapeAndCheck() {
+               self.scrape(opts, function(error, responseCode, body){
+                       // Return 404s instead of 520s for invalid pubmed links
+                       if (responseCode === 200){
+                               callback(error, 200, body);
+                       } else if (responseCode === 404) {
+                               // 404s here are due to a failed export and
+                               // the error and body should be passed through.
+                               callback(error, 404, body);
+                       } else {
+                               // Overwrite body of any 520 response from 
scraper
+                               callback(error, 404, null);
+                       }
+               });
+       }
+
+       zoteroWebRequest(requestedURL, format, function(error, response, body) {
+               logger.log('debug/zotero', "Zotero request made for: " + 
requestedURL);
+               if (error) {
+                       logger.log('warn/zotero', error);
+                       self.stats.increment('zotero.req.error');
+                       scrapeAndCheck();
+               } else if (!response) {
+                       callback(true, 404, null);
+               } else {
+                       self.stats.increment('zotero.req.' + 
Math.floor(response.statusCode / 100)
+                       + 'xx');
+
+                       var citationInBody = Array.isArray(response.body) && 
response.body[0];
+                       var okBody = response.statusCode === 200 && 
citationInBody;
+                       var notImplemented = response.statusCode === 501;
+
+                       if (okBody) {
+                               logger.log('debug/zotero', "Successfully 
retrieved and translated\
+                                body from Zotero");
+                               callback (null, 200, body);
+                       } else {
+
+                               // 501 not implemented can indicate that the 
link is invalid, or
+                               // that the translator is broken. Translators 
can break at any
+                               // time if the website's url structure is 
changed.
+                               if (notImplemented){
+                                       logger.log('trace/zotero', "Zotero not 
implemented error; \
+                                       PubMed ID may be invalid");
+                               }
+
+                               // Zotero ideally should return 501 for http 
errors at the
+                               // location, but the PubMed translator is 
currently broken and
+                               // returns 200 and empty body for http errors 
for PMCs only
+                               if (!citationInBody){
+                                       logger.log('trace/zotero', "Zotero 
returned 200 but with \
+                                       empty body; PubMed ID may be invalid");
+                               }
+
+                               if (!citationInBody || notImplemented) {
+                                       scrapeAndCheck();
+                               } else {
+                                       callback(true, 404, null);
+                               }
+                       }
                }
        });
 };
+
 
 /**
  * Determine type of string (doi, url) and callback on correct handler
@@ -175,24 +287,21 @@
  * @param  {Function} callback          callback(extractedValue, 
correctFunction)
  */
 CitoidService.prototype.distinguish = function(rawSearchInput, callback){
-       var reDOI, rePMID, rePMCID, rePMCID2, reHTTP, reWWW,
-               parsedURL,
-               matchDOI, matchPMID, matchPMCID, matchHTTP, matchWWW,
-               search = rawSearchInput.trim();
 
-       reHTTP = new RegExp('^((https?)://.+\\..+)'); // Assumes all strings 
with http/s protocol are URLs
-       reWWW = new RegExp('^((www)\\..+\\..+)'); // Assumes all strings with 
www substring are URLs
-       reDOI = new RegExp('\\b10\\.?[0-9]{3,4}(?:[.][0-9]+)*/.*');
-       rePMID = new RegExp('^\\d{8}\\b');
-       rePMCID = new RegExp('\\bPMC\\d{7}\\b');
-       rePMCID2 = new RegExp('^\\d{7}\\b');
+       var search = rawSearchInput.trim();
 
-       matchHTTP = search.match(reHTTP);
-       matchDOI = search.match(reDOI);
-       matchPMID = search.match(rePMID);
-       matchPMCID = search.match(rePMCID);
-       matchWWW = search.match(reWWW);
+       var reHTTP = new RegExp('^((https?)://.+\\..+)'); // Assumes all 
strings with http/s protocol are URLs
+       var reWWW = new RegExp('^((www)\\..+\\..+)'); // Assumes all strings 
with www substring are URLs
+       var reDOI = new RegExp('\\b10\\.?[0-9]{3,4}(?:[.][0-9]+)*/.*');
+       var rePMID = new RegExp('^\\d{8}\\b');
+       var rePMCID = new RegExp('\\bPMC\\d{7}\\b');
+       var rePMCID2 = new RegExp('^\\d{7}\\b');
 
+       var matchHTTP = search.match(reHTTP);
+       var matchDOI = search.match(reDOI);
+       var matchPMID = search.match(rePMID);
+       var matchPMCID = search.match(rePMCID);
+       var matchWWW = search.match(reWWW);
 
        if (matchHTTP || matchWWW){
                this.stats.increment('input.url');
@@ -202,18 +311,18 @@
                callback(matchDOI[0], this.requestFromDOI.bind(this));
        } else if (matchPMID) {
                this.stats.increment('input.pmid');
-               callback(matchPMID[0], this.requestFromPubMedID.bind(this));
+               callback(matchPMID[0], this.requestFromPM.bind(this, 'PMID'));
        } else if (matchPMCID) {
                this.stats.increment('input.pmcid');
-               callback(matchPMCID[0], this.requestFromPubMedID.bind(this));
+               callback(matchPMCID[0], this.requestFromPM.bind(this, 'PMCID'));
        } else {
-               matchPMCID = search.match(rePMCID2);
+               matchPMCID = search.match(rePMCID2); // Detects PMCIDs with no 
PMC prefix
                if (matchPMCID) {
                        this.stats.increment('input.pmcid');
-                       callback('PMC' + matchPMCID[0], 
this.requestFromPubMedID.bind(this));
+                       callback('PMC' + matchPMCID[0], 
this.requestFromPM.bind(this, 'PMCID'));
                } else {
                        this.stats.increment('input.url');
-                       parsedURL = urlParse.parse(search);
+                       var parsedURL = urlParse.parse(search);
                        if (!parsedURL.protocol){
                                search = 'http://'+ search;
                        }
@@ -228,15 +337,19 @@
  * @param  {Function} callback  callback(error, responseCode, citation)
  */
 CitoidService.prototype.scrape = function(opts, callback){
-       var cbResCode, // Response Code to send to the callback
-               format = opts.format,
-               scrape = this.scraper.scrape.bind(this.scraper),
-               zoteroExportRequest = 
this.zoteroService.zoteroExportRequest.bind(this.zoteroService);
+       var cbResCode; // Response Code to send to the callback
+       var self = this;
+       var format = opts.format;
+       var scrape = this.scraper.scrape.bind(this.scraper);
+       var zoteroExportRequest = 
this.zoteroService.zoteroExportRequest.bind(this.zoteroService);
+
        if (format === 'bibtex') {
                scrape(opts, function(error, scrapeResCode, citation) {
                        zoteroExportRequest(citation[0], format, function(err, 
zotResCode, body) {
-                               if (zotResCode !== 200){
-                                       body = "Unable to serve "+ format +" 
format at this time";
+                               if (err || zotResCode !== 200){
+                                       self.logger.log('trace/zotero', "Unable 
to translate to export format bibtex");
+                                       var message  = "Unable to serve " + 
format + " format at this time";
+                                       body = {Error: message};
                                        cbResCode = 404; // 404 error if cannot 
translate into alternate format
                                } else if (scrapeResCode !== 200){
                                        cbResCode = 520; // 520 error if the 
scraper couldn't scrape from url
diff --git a/package.json b/package.json
index 1707ab2..e283dd8 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "citoid",
-  "version": "0.2.4",
+  "version": "0.2.5",
   "description": "Converts search terms such as URL or DOI into citations.",
   "scripts": {
     "start": "service-runner",
diff --git a/test/features/errors/index.js b/test/features/errors/index.js
index b997ed9..b56e120 100644
--- a/test/features/errors/index.js
+++ b/test/features/errors/index.js
@@ -77,7 +77,32 @@
                        assert.status(res, 404);
                }, function(err) {
                        assert.status(err, 404);
-                       assert.deepEqual(err.body.Error, 'Unable to resolve 
DOI',
+                       assert.deepEqual(err.body.Error, 'Unable to resolve DOI 
' + doi,
+                               'Unexpected error message ' + err.body.Error);
+               });
+       });
+
+       it('bad pmid', function() {
+               var pmid = '99999999';
+               return server.query(pmid, 'mediawiki', 'en')
+               .then(function(res) {
+                       assert.status(res, 404);
+               }, function(err) {
+                       assert.status(err, 404);
+                       assert.deepEqual(err.body.Error,
+                               'Unable to locate resource with PMID ' + pmid,
+                               'Unexpected error message ' + err.body.Error);
+               });
+       });
+
+       it('bad pmcid', function() {
+               var pmcid = 'PMC9999999';
+               return server.query(pmcid, 'mediawiki', 'en')
+               .then(function(res) {
+                       assert.status(res, 404);
+               }, function(err) {
+                       assert.status(err, 404);
+                       assert.deepEqual(err.body.Error, 'Unable to locate 
resource with PMCID ' + pmcid,
                                'Unexpected error message ' + err.body.Error);
                });
        });
diff --git a/test/features/scraping/index.js b/test/features/scraping/index.js
index ff2e333..5538399 100644
--- a/test/features/scraping/index.js
+++ b/test/features/scraping/index.js
@@ -12,8 +12,23 @@
 
        before(function () { return server.start(); });
 
-       it('pmid', function() {
-               return server.query('23555203').then(function(res) {
+       //PMID on NIH website that is not found in the id converter api
+       it('pmid (not in id converter)', function() {
+               return server.query('14656957').then(function(res) {
+                       assert.status(res, 200);
+                       assert.checkCitation(res, 'Seventh report of the Joint 
National Committee on Prevention, Detection, Evaluation, and Treatment of High 
Blood Pressure');
+               });
+       });
+
+       it('pmcid with prefix', function() {
+               return server.query('PMC3605911').then(function(res) {
+                       assert.status(res, 200);
+                       assert.checkCitation(res, 'Viral Phylodynamics');
+               });
+       });
+
+       it('pmcid without prefix', function() {
+               return server.query('3605911').then(function(res) {
                        assert.status(res, 200);
                        assert.checkCitation(res, 'Viral Phylodynamics');
                });

-- 
To view, visit https://gerrit.wikimedia.org/r/200314
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie2830d42fa63fecd30db72f723e6d5d9979f51b5
Gerrit-PatchSet: 4
Gerrit-Project: mediawiki/services/citoid
Gerrit-Branch: master
Gerrit-Owner: Mvolz <[email protected]>
Gerrit-Reviewer: Mobrovac <[email protected]>
Gerrit-Reviewer: Mvolz <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to