Mobrovac has submitted this change and it was merged.
Change subject: Fix addItemType function to use og and dc
......................................................................
Fix addItemType function to use og and dc
Previously addItemType was not adding
itemTypes for dublinCore metadata at all,
and adding the type using openGraph
metadata was broken. This change adds
dc metadata to type determination and
fixes the og type determination.
Tests added.
Bug: T133540
Change-Id: I822c70f0f2bd82d60a6da4434800c99f20c4d9dc
---
M lib/Scraper.js
M lib/translators/dublinCore.js
M test/features/unit/scraper.js
3 files changed, 45 insertions(+), 14 deletions(-)
Approvals:
Mobrovac: Looks good to me, approved
jenkins-bot: Verified
diff --git a/lib/Scraper.js b/lib/Scraper.js
index 82c55d3..174d539 100644
--- a/lib/Scraper.js
+++ b/lib/Scraper.js
@@ -204,7 +204,7 @@
* @param {Object} chtml Cheerio object
* @return {String} Content-type string or null
*/
-exports.contentTypeFromBody= function(chtml){
+exports.contentTypeFromBody = function(chtml){
// TODO: Stream and read buffer with regex
var charset = chtml('meta[charset]').first().attr('charset'); // i.e.
<meta charset="iso-8859-1" />
if (charset) {return charset;}
@@ -240,9 +240,11 @@
* @return {Object} Bluebird promise for citation object
*/
Scraper.prototype.parseHTML = function(cr, chtml){
+ var logger = this.logger;
var citation = cr.response.citation[0];
- function doSyncMethods(metadata, cit){
+ var addMetadata = BBPromise.method(function(metadata, cit){
+ logger.log('debug/scraper', "Running syncronous methods");
cit = addItemType(metadata, cit);
@@ -270,7 +272,7 @@
}
return cit;
- }
+ });
return parseAll(chtml)
.then(function(metadata){
@@ -289,15 +291,17 @@
}
}
return crossRef(cr, metadata).then(function(cit){
- return doSyncMethods(metadata, cit);
+ return addMetadata(metadata, cit);
},
// Rejection handler for crossRef
function(){
- return doSyncMethods(metadata, citation);
+ logger.log('debug/scraper', "crossRef failure");
+ return addMetadata(metadata, citation);
});
},
// Rejection handler for parseAll
function(){
+ logger.log('debug/scraper', "ParseAll failure");
return fallback(citation);
});
@@ -360,8 +364,8 @@
* @return {Object} citation object
*/
function addItemType(metadata, citation){
-
citation = citation || {};
+ metadata = metadata || {};
// Set citation type from metadata
if (!citation.itemType){ // Don't overwrite itemtype
@@ -371,14 +375,16 @@
else if (metadata.highwirePress){
citation.itemType =
itemTypeFromPress(metadata.highwirePress);
}
- else if (metadata['type'] && og.types[metadata['type']]){ // if
there is a type in the results and that type is defined in openGraph.js
- citation.itemType = og.types[metadata['type']];
+ else if (metadata.openGraph && metadata.openGraph['type'] &&
og.types[metadata.openGraph['type']]){ // if there is a type in the results and
that type is defined in openGraph.js
+ citation.itemType =
og.types[metadata.openGraph['type']];
+ }
+ else if (metadata.dublinCore && metadata.openGraph['type'] &&
dc.types[metadata.dublinCore['type']]){ // if there is a type in the results
and that type is defined in dublinCore.js
+ citation.itemType =
dc.types[metadata.dublinCore['type']];
}
else {
citation.itemType = 'webpage'; //default itemType
}
}
-
return citation;
}
@@ -460,9 +466,9 @@
return crossRef(cr, citation).then(function(cit){
citation = fallback520(cit, 200);
defaultLogger.log('debug/scraper', "Sucessfully got metadata
from doi " + cr.doi);
- return cr;
// Rejection
}, function(){
+ defaultLogger.log('info/scraper', "Unable to get any metadata
from doi " + cr.doi + "; returning 520 response.");
citation = fallback520(citation, 520);
return cr;
});
@@ -524,4 +530,5 @@
}
module.exports.translate = translate;
-module.exports.itemTypeFromPress = itemTypeFromPress;
\ No newline at end of file
+module.exports.itemTypeFromPress = itemTypeFromPress;
+module.exports.addItemType = addItemType;
\ No newline at end of file
diff --git a/lib/translators/dublinCore.js b/lib/translators/dublinCore.js
index b88fdd6..f31e48a 100644
--- a/lib/translators/dublinCore.js
+++ b/lib/translators/dublinCore.js
@@ -20,6 +20,7 @@
Dataset: 'webpage',
Event: 'webpage',
Image: 'artwork',
+ 'Image.Moving': 'videoRecording',
InteractiveResource: 'webpage',
MovingImage: 'videoRecording',
PhysicalObject: 'webpage',
@@ -27,7 +28,7 @@
Software: 'computerProgram',
Sound: 'audioRecording',
StillImage: 'artwork',
- Text: 'webpage',
+ Text: 'webpage'
};
/**
diff --git a/test/features/unit/scraper.js b/test/features/unit/scraper.js
index 0733926..cbf4a20 100644
--- a/test/features/unit/scraper.js
+++ b/test/features/unit/scraper.js
@@ -28,7 +28,7 @@
{value:article, name:'article'}
];
-describe('lib/Scraper.js translate function: ', function() {
+describe('translate function: ', function() {
var types = new CachedTypes();
var citation;
var result;
@@ -69,4 +69,27 @@
});
});
});
-});
\ No newline at end of file
+});
+
+describe('addItemType function: ', function() {
+ it('sets videoRecording itemType', function() {
+ return meta.parseAll(movie).then(function(metadata){
+ var itemType = scraper.addItemType(metadata, {}).itemType;
+ assert.deepEqual(itemType, 'videoRecording', 'Expected itemType
videoRecording, got itemType ' + itemType);
+ });
+ });
+
+ it('sets article itemType', function() {
+ return meta.parseAll(article).then(function(metadata){
+ var itemType = scraper.addItemType(metadata, {}).itemType;
+ assert.deepEqual(itemType, 'journalArticle', 'Expected itemType
journalArticle, got itemType ' + itemType);
+ });
+ });
+
+ it('sets itemType webpage if no relevant metadata available', function() {
+ var metadata = {general:{title:'Example domain'}};
+ var itemType = scraper.addItemType(metadata, {}).itemType;
+ assert.deepEqual(itemType, 'webpage', 'Expected itemType webpages, got
itemType ' + itemType);
+
+ });
+});
--
To view, visit https://gerrit.wikimedia.org/r/285224
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I822c70f0f2bd82d60a6da4434800c99f20c4d9dc
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/services/citoid
Gerrit-Branch: master
Gerrit-Owner: Mvolz <[email protected]>
Gerrit-Reviewer: Mobrovac <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits