Mvolz has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/219802

Change subject: [WIP] Cleaner import/export of Zotero & Scraper
......................................................................

[WIP] Cleaner import/export of Zotero & Scraper

WIP: Move data enrichment into Scraper.js

TODO: Move data enrichment into ZoteroService.js

TODO: Move date enrichment out of Exporter.js

Change-Id: Ida32d543026c2bab79f23b33d406fba96097d0bc
---
M lib/Exporter.js
M lib/Scraper.js
M test/features/scraping/index.js
M test/utils/assert.js
4 files changed, 100 insertions(+), 23 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/citoid 
refs/changes/02/219802/1

diff --git a/lib/Exporter.js b/lib/Exporter.js
index 1bdccda..732288b 100644
--- a/lib/Exporter.js
+++ b/lib/Exporter.js
@@ -246,6 +246,7 @@
                        }
                }
                return citation;
+       // TODO: make this work for any pubmed id, i.e. add doi from pubmed id.
        } else if (!citation.PMID && citation.DOI) {
                //if pmid is not found, lookup the pmid using the DOI
                return pubMedRequest(citation.DOI, 
defaultLogger).then(function(object){
@@ -362,4 +363,6 @@
 /* Exports */
 module.exports = Exporter;
 
+module.exports.addPubMedIdentifiers = addPubMedIdentifiers;
+
 
diff --git a/lib/Scraper.js b/lib/Scraper.js
index 1af37b7..9eeb2f6 100644
--- a/lib/Scraper.js
+++ b/lib/Scraper.js
@@ -21,6 +21,7 @@
  */
 var og = require('./translators/openGraph.js');
 var gen = require('./translators/general.js');
+var addPubMed = require('./Exporter.js').addPubMedIdentifiers;
 
 var Scraper = function(app){
 
@@ -48,7 +49,7 @@
        var acceptLanguage = cr.acceptLanguage;
        var url = cr.url || cr.idValue;
        var userAgent = this.userAgent;
-       var citation = {};
+       var citation = citationFromCR(cr); // Promise for citation
 
        var j = request.jar(); // One time use cookie jar
 
@@ -73,10 +74,9 @@
                                logger.log('warn/scraper', "Status from 
resource server at " + url +
                                        ": " + response.status);
                        }
-                       citation = build520(citation, url);
-                       cr.response.citation.push(citation);
-                       cr.response.responseCode = 520;
-                       return cr;
+                       return citation.then(function(citation){
+                               return build520(cr);
+                       });
                } else {
                        var str; // String from decoded Buffer object
                        var defaultCT = 'utf-8'; // Default content-type
@@ -116,29 +116,35 @@
 
                        // If the html has been successfully loaded into 
cheerio, proceed.
                        if (chtml){
-                               return scraper.parseHTML(url, chtml, citation)
-                               .then(function(citation){
-                                       logger.log('debug/scraper', 
"Sucessfully scraped resource at " + url);
-                                       cr.response.citation.push(citation);
-                                       cr.response.responseCode = 200;
-                                       return cr;
+                               // Create initial citation, which returns 
citation
+                               return citation.then(
+                                       function(citation){
+                                       scraper.parseHTML(url, chtml, 
citation).then(
+                                       // Success handler for parseHTML
+                                       function(citation){
+                                               logger.log('debug/scraper', 
"Successfully scraped resource at " + url);
+                                               cr.response.responseCode = 200;
+                                               return cr;
+                                       },
+                                       // Rejection handler for parseHTML
+                                       function(){
+                                               return build520(cr);
+                                       });
                                });
                        } else {
-                               citation = build520(citation, url);
                                logger.log('debug/scraper', "Failed to scrape 
resource at " + url);
-                               cr.response.citation.push(citation);
-                               cr.response.responseCode  = 520;
-                               return cr;
+                               return citation.then(function(citation){
+                                       return build520(cr);
+                               });
                        }
                }
        })
        .catch(function(error){
-               citation = build520(citation, url);
                logger.log('warn/scraper', error);
                logger.log('debug/scraper', "Failed to scrape resource at " + 
url);
-               cr.response.citation.push(citation);
-               cr.response.responseCode  = 520;
-               return cr;
+               return citation.then(function(citation){
+                       return build520(cr);
+               });
        });
 };
 
@@ -360,11 +366,47 @@
  * @param  {String} url      requested url
  * @return {Object}          filled in citation object
  */
-function build520(citation, url){
+function build520(cr){
+       var citation = cr.response.citation[0];
 
        citation.itemType = 'webpage';
-       citation.url = url;
-       citation.title = url;
-       return citation;
+       citation.title = citation.url;
+
+       cr.response.responseCode = 520;
+       return cr;
 
 }
+
+/**
+ * Create initial citation from empty cr.response
+ * @param  {Object} cr        citoid request object with response and identifiers
+ * @return {Object}           BBPromise for the citation object in cr.response
+ */
+var citationFromCR = BBPromise.method(function(cr){
+       // Push empty citation to response if one does not exist already.
+       // TODO: Do this further upstream
+       if (!cr.response.citation[0]){
+               cr.response.citation.push({});
+       }
+       var citation = cr.response.citation[0]; // Pointer to citation
+       // Set requested identifier as part of citation
+       citation[cr.idType] = cr.idValue;
+       // If not already set from idType, set url from cr.url
+       // For instance, from a resolved DOI
+       if (!citation.url){ citation.url = cr.url; }
+       // Try to get doi from url
+       if (!citation.doi){
+               var reDOI = new RegExp('\\b10\\.?[0-9]{3,4}(?:[.][0-9]+)*/.*');
+               var matchDOI = citation.url.match(reDOI);
+               // TODO: Actually resolve to make sure DOI is valid; separate 
function out from requestFromDOI
+               // Could potentially skip this if we know it's dx.doi.org
+               // Some links will be false positive if doi is in the middle of 
url since there's no way to signal end of doi
+               if (matchDOI[0]) { citation.doi = matchDOI[0]; }
+
+       }
+       // TODO: Make below function work for any pubmed id, i.e. add doi from 
pubmed id. in exporter
+       // TODO: Test block
+       return addPubMed(citation).then(function(citation){
+               return cr.response.citation[0];
+       });
+});
\ No newline at end of file
diff --git a/test/features/scraping/index.js b/test/features/scraping/index.js
index 85d3fbf..4245f35 100644
--- a/test/features/scraping/index.js
+++ b/test/features/scraping/index.js
@@ -126,6 +126,34 @@
                                assert.checkCitation(res);
                        });
                });
+
+               // Ensure DOI is present in non-zotero scraped page when 
requested from doi
+               it('doi pointing to resource not in zotero', function() {
+                       return 
server.query('10.2307/3677029').then(function(res) {
+                               assert.status(res, 200);
+                               assert.checkCitation(res);
+                               assert.notDeepEqual(res.body[0].doi, undefined, 
'Missing DOI');
+                       });
+               });
+
+               // Ensure DOI is present in non-zotero scraped page when 
requested from a doi link
+               it('dx.doi link pointing to resource not in zotero', function() 
{
+                       return 
server.query('http://dx.doi.org/10.2307/3677029').then(function(res) {
+                               assert.status(res, 200);
+                               assert.checkCitation(res);
+                               assert.notDeepEqual(res.body[0].doi, undefined, 
'Missing DOI');
+                       });
+               });
+
+               // Ensure DOI is present in non-zotero scraped page where 
scraping fails
+               it('doi pointing to resource that can\'t be scraped', 
function() {
+                       return 
server.query('10.1038/scientificamerican0200-90').then(function(res) {
+                               assert.status(res, 520);
+                               assert.checkCitation(res);
+                               assert.notDeepEqual(res.body[0].doi, undefined, 
'Missing DOI');
+                       });
+               });
+
        });
 
        // The following tests require the WMF fork of the zotero translators, 
as found
diff --git a/test/utils/assert.js b/test/utils/assert.js
index 40bbf0b..ce57ed0 100644
--- a/test/utils/assert.js
+++ b/test/utils/assert.js
@@ -101,6 +101,10 @@
                throw new Error('Expected to receive an array of citations, 
got: ' + JSON.stringify(cit));
        }
 
+       if(cit.length !== 1){
+               throw new Error('Expected to receive an array of 1 citation, 
got: ' + cit.length);
+       }
+
        cit = cit[0];
 
        // Check presence of all required fields

-- 
To view, visit https://gerrit.wikimedia.org/r/219802
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ida32d543026c2bab79f23b33d406fba96097d0bc
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/citoid
Gerrit-Branch: master
Gerrit-Owner: Mvolz <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to