Mvolz has uploaded a new change for review.
https://gerrit.wikimedia.org/r/219802
Change subject: [WIP] Cleaner import/export of Zotero & Scraper
......................................................................
[WIP] Cleaner import/export of Zotero & Scraper
WIP: Move data enrichment into Scraper.js
TODO: Move data enrichment into ZoteroService.js
TODO: Move date enrichment out of Exporter.js
Change-Id: Ida32d543026c2bab79f23b33d406fba96097d0bc
---
M lib/Exporter.js
M lib/Scraper.js
M test/features/scraping/index.js
M test/utils/assert.js
4 files changed, 100 insertions(+), 23 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/citoid
refs/changes/02/219802/1
diff --git a/lib/Exporter.js b/lib/Exporter.js
index 1bdccda..732288b 100644
--- a/lib/Exporter.js
+++ b/lib/Exporter.js
@@ -246,6 +246,7 @@
}
}
return citation;
+ // TODO: make this work for any pubmed id, i.e. add doi from pubmed id.
} else if (!citation.PMID && citation.DOI) {
//if pmid is not found, lookup the pmid using the DOI
return pubMedRequest(citation.DOI,
defaultLogger).then(function(object){
@@ -362,4 +363,6 @@
/* Exports */
module.exports = Exporter;
+module.exports.addPubMedIdentifiers = addPubMedIdentifiers;
+
diff --git a/lib/Scraper.js b/lib/Scraper.js
index 1af37b7..9eeb2f6 100644
--- a/lib/Scraper.js
+++ b/lib/Scraper.js
@@ -21,6 +21,7 @@
*/
var og = require('./translators/openGraph.js');
var gen = require('./translators/general.js');
+var addPubMed = require('./Exporter.js').addPubMedIdentifiers;
var Scraper = function(app){
@@ -48,7 +49,7 @@
var acceptLanguage = cr.acceptLanguage;
var url = cr.url || cr.idValue;
var userAgent = this.userAgent;
- var citation = {};
+ var citation = citationFromCR(cr); // Promise for citation
var j = request.jar(); // One time use cookie jar
@@ -73,10 +74,9 @@
logger.log('warn/scraper', "Status from
resource server at " + url +
": " + response.status);
}
- citation = build520(citation, url);
- cr.response.citation.push(citation);
- cr.response.responseCode = 520;
- return cr;
+ return citation.then(function(citation){
+ return build520(cr);
+ });
} else {
var str; // String from decoded Buffer object
var defaultCT = 'utf-8'; // Default content-type
@@ -116,29 +116,35 @@
// If the html has been successfully loaded into
cheerio, proceed.
if (chtml){
- return scraper.parseHTML(url, chtml, citation)
- .then(function(citation){
- logger.log('debug/scraper',
"Sucessfully scraped resource at " + url);
- cr.response.citation.push(citation);
- cr.response.responseCode = 200;
- return cr;
+ // Create initial citation, which returns
citation
+ return citation.then(
+ function(citation){
+ scraper.parseHTML(url, chtml,
citation).then(
+ // Success handler for parseHTML
+ function(citation){
+ logger.log('debug/scraper',
"Successfully scraped resource at " + url);
+ cr.response.responseCode = 200;
+ return cr;
+ },
+ // Rejection handler for parseHTML
+ function(){
+ return build520(cr);
+ });
});
} else {
- citation = build520(citation, url);
logger.log('debug/scraper', "Failed to scrape
resource at " + url);
- cr.response.citation.push(citation);
- cr.response.responseCode = 520;
- return cr;
+ return citation.then(function(citation){
+ return build520(cr);
+ });
}
}
})
.catch(function(error){
- citation = build520(citation, url);
logger.log('warn/scraper', error);
logger.log('debug/scraper', "Failed to scrape resource at " +
url);
- cr.response.citation.push(citation);
- cr.response.responseCode = 520;
- return cr;
+ return citation.then(function(citation){
+ return build520(cr);
+ });
});
};
@@ -360,11 +366,47 @@
* @param {String} url requested url
* @return {Object} filled in citation object
*/
-function build520(citation, url){
+function build520(cr){
+ var citation = cr.response.citation[0];
citation.itemType = 'webpage';
- citation.url = url;
- citation.title = url;
- return citation;
+ citation.title = citation.url;
+
+ cr.response.responseCode = 520;
+ return cr;
}
+
+/**
+ * Create initial citation from empty cr.response
+ * @param {Object} cr CitoidRequest object containing the response to initialise
+ * @return {Object} BBPromise resolving to the initial citation object
+ */
+var citationFromCR = BBPromise.method(function(cr){
+ // Push empty citation to response if one does not exist already.
+ // TODO: Do this further upstream
+ if (!cr.response.citation[0]){
+ cr.response.citation.push({});
+ }
+ var citation = cr.response.citation[0]; // Pointer to citation
+ // Set requested identifier as part of citation
+ citation[cr.idType] = cr.idValue;
+ // If not already set from idType, set url from cr.url
+ // For instance, from a resolved DOI
+ if (!citation.url){ citation.url = cr.url; }
+ // Try to get doi from url
+ if (!citation.doi){
+ var reDOI = new RegExp('\\b10\\.?[0-9]{3,4}(?:[.][0-9]+)*/.*');
+ var matchDOI = citation.url.match(reDOI);
+ // TODO: Actually resolve to make sure DOI is valid; separate
function out from requestFromDOI
+ // Could potentially skip this if we know it's dx.doi.org
+ // Some links will be false positive if doi is in the middle of
url since there's no way to signal end of doi
+ if (matchDOI[0]) { citation.doi = matchDOI[0]; }
+
+ }
+ // TODO: Make below function work for any pubmed id, i.e. add doi from
pubmed id. in exporter
+ // TODO: Test block
+ return addPubMed(citation).then(function(citation){
+ return cr.response.citation[0];
+ });
+});
\ No newline at end of file
diff --git a/test/features/scraping/index.js b/test/features/scraping/index.js
index 85d3fbf..4245f35 100644
--- a/test/features/scraping/index.js
+++ b/test/features/scraping/index.js
@@ -126,6 +126,34 @@
assert.checkCitation(res);
});
});
+
+ // Ensure DOI is present in non-zotero scraped page when
requested from doi
+ it('doi pointing to resource not in zotero', function() {
+ return
server.query('10.2307/3677029').then(function(res) {
+ assert.status(res, 200);
+ assert.checkCitation(res);
+ assert.notDeepEqual(res.body[0].doi, undefined,
'Missing DOI');
+ });
+ });
+
+               // Ensure DOI is present in non-zotero scraped page when
requested from doi link
+ it('dx.doi link pointing to resource not in zotero', function()
{
+ return
server.query('http://dx.doi.org/10.2307/3677029').then(function(res) {
+ assert.status(res, 200);
+ assert.checkCitation(res);
+ assert.notDeepEqual(res.body[0].doi, undefined,
'Missing DOI');
+ });
+ });
+
+ // Ensure DOI is present in non-zotero scraped page where
scraping fails
+ it('doi pointing to resource that can\'t be scraped',
function() {
+ return
server.query('10.1038/scientificamerican0200-90').then(function(res) {
+ assert.status(res, 520);
+ assert.checkCitation(res);
+ assert.notDeepEqual(res.body[0].doi, undefined,
'Missing DOI');
+ });
+ });
+
});
// The following tests require the WMF fork of the zotero translators,
as found
diff --git a/test/utils/assert.js b/test/utils/assert.js
index 40bbf0b..ce57ed0 100644
--- a/test/utils/assert.js
+++ b/test/utils/assert.js
@@ -101,6 +101,10 @@
throw new Error('Expected to receive an array of citations,
got: ' + JSON.stringify(cit));
}
+ if(cit.length !== 1){
+ throw new Error('Expected to receive an array of 1 citation,
got: ' + cit.length);
+ }
+
cit = cit[0];
// Check presence of all required fields
--
To view, visit https://gerrit.wikimedia.org/r/219802
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ida32d543026c2bab79f23b33d406fba96097d0bc
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/citoid
Gerrit-Branch: master
Gerrit-Owner: Mvolz <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits