[MediaWiki-commits] [Gerrit] Use html-metadata library - change (mediawiki...citoid)

Mobrovac (Code Review) Thu, 26 Feb 2015 11:04:07 -0800

Mobrovac has submitted this change and it was merged.

Change subject: Use html-metadata library
......................................................................



Use html-metadata library

Use the html-metadata library to extract additional
metadata from scraped pages.

Add translator folder to contain files to translate
between scraped metadata and internal format (Zotero).

Add translators for general metadata and for OpenGraph
metadata. Support for vertical types in OpenGraph.js
is incomplete.

Bug: T1069

Change-Id: I8d571305e853f41748ade494f52430a44e11bf75
---
M Gruntfile.js
M lib/requests.js
M lib/scrape.js
A lib/translators/README.md
A lib/translators/general.js
A lib/translators/openGraph.js
M package.json
7 files changed, 263 insertions(+), 58 deletions(-)

Approvals:
  Mobrovac: Looks good to me, approved
  GWicke: Looks good to me, but someone else must approve



diff --git a/Gruntfile.js b/Gruntfile.js
index 554661b..4bf92a2 100644
--- a/Gruntfile.js
+++ b/Gruntfile.js
@@ -13,7 +13,8 @@
                        all: [
                                '*.js',
                                'localsettings.js.sample',
-                               'lib/*.js'
+                               'lib/*.js',
+                               'lib/translators/*.js'
                        ]
                }
        });
diff --git a/lib/requests.js b/lib/requests.js
index a198730..73d2795 100644
--- a/lib/requests.js
+++ b/lib/requests.js
@@ -11,7 +11,7 @@
 
 /* Import Local Modules */
 var unshorten = require('./unshorten.js'),
-       scrape = require('./scrape.js').scrape,
+       scrape = require('./scrape.js'),
        zoteroWebRequest = require('./zotero.js').zoteroWebRequest,
        zoteroExportRequest = require('./zotero.js').zoteroExportRequest,
        pubMedRequest = require('./pubMedRequest.js');
diff --git a/lib/scrape.js b/lib/scrape.js
index 21e4c1b..38abf8e 100644
--- a/lib/scrape.js
+++ b/lib/scrape.js
@@ -3,82 +3,176 @@
  * https://www.mediawiki.org/wiki/citoid
  */
 
-/* Import Modules */
+/*
+  Module dependencies
+*/
+
 var request = require('request'),
        urlParse = require('url'),
-       cheerio = require('cheerio');
+       cheerio = require('cheerio'),
+       parseMetaData = require('html-metadata').parseAll,
+       bunyan = require('bunyan'),
+       og = require('./translators/openGraph.js'),
+       gen = require('./translators/general.js'),
+       log = bunyan.createLogger({name: "citoid"});
 
 /**
- * Currently scrapes title only
+ * Scrapes, parses, and translates webpages to obtain Zotero format metadata
  * callback runs on list of json objs (var body)
  * @param  {String}   url      url to scrape
  * @param  {Function} callback callback(error, statusCode, body)
  */
 
-var scrape = function(url, callback){
+exports = module.exports = function(url, callback){
 
-       var $;
-
-       function getTitle() {
-
-               var title;
-
-               // Try to get title from itemprop="heading"
-               title = $('*[itemprop~="headline"]').first().text();
-               if (title) { return title; }
-
-               // Try to get title from <title> tag
-               title = $('title').first().text();
-               if (title) { return title; }
-
-               // Default
-               return url;
-       }
+       var chtml,
+               citation = {url: url, title: url};
 
        request(
                {
                        url: url,
-                       followAllRedirects: true
+                       followAllRedirects: true,
                }, function(error, response, html){
-
-                       var citation = {itemType: 'webpage', url: url, title: 
url};
 
                        if (error || !response || response.statusCode !== 200) {
                                callback(error, 520, [citation]);
                                return;
+                       } else {
+                               try {
+                                       chtml = cheerio.load(html);
+                                       citation.title = null;
+                                       exports.parseHTML(url, chtml, citation, 
function(citation){
+                                               citation = citation;
+                                               callback(null, 200, [citation]);
+                                       });
+                               } catch (e){
+                                       log.error(e);
+                                       callback(error, 520, [citation]);
+                               }
                        }
-
-                       try{
-                               $ = cheerio.load(html);
-                       }
-                       catch (e){
-                               callback(error, 520, [citation]);
-                       }
-
-                       citation.title = getTitle();
-
-                       // Access date on format YYYY-MM-DD
-                       citation.accessDate = (new 
Date()).toISOString().substring(0, 10);
-
-                       var parsedUrl = response.request.uri ? 
response.request.uri : urlParse.parse(url);
-
-                       if (citation.title && parsedUrl && parsedUrl.hostname) {
-                               citation.publicationTitle = parsedUrl.hostname;
-                       }
-
-                       callback(error, 200, [citation]);
        });
 };
 
+/**
+ * Adds metadata to citation object given a metadata of a
+ * specific type, and a translator specific to that metadata type
+ * @param  {Object} metaData   [description]
+ * @param  {Object} translator
+ */
+exports.translate = function(citation, metaData, translator){
+       var translatedProperty, value;
+       for (var key in metaData){ // Loop through results
+               translatedProperty = translator[key]; // Look up property in 
translator
+               if (translatedProperty && !citation[translatedProperty]){ // If 
it has a corresponding translation and won't overwrite properties already set
+                       //either set value to property or modify with function
+                       if (typeof translatedProperty === 'string'){
+                               value = metaData[key];
+                       } else if (typeof translatedProperty === 'function'){
+                               citation = translatedProperty(metaData[key], 
citation);
+                       } else {return citation;}
+
+                       if (typeof value === 'string'){
+                               citation[translatedProperty] = metaData[key]; 
// Add value of property to citation object
+                       } else if (Array.isArray(value)) {
+                               citation[translatedProperty] = 
metaData[key][0]; // Choose first value if array
+                       }
+               }
+       }
+       return citation;
+};
+
+/**
+ * Adds html metadata to a given citation object given
+ * the html loaded into cheerio
+ * @param  {String}   url      url being scraped
+ * @param  {Objct}   chtml     Cheerio object with html loaded
+ * @param  {Object}   citation a citation object contain default parameters
+ * @param  {Function} callback callback(citation)
+ */
+exports.parseHTML = function(url, chtml, citation, callback){
+       var metaData, typeTranslator;
+
+       parseMetaData(chtml, function(err, results){
+               metaData = results; //only use open graph here
+       });
+
+       // translator/openGraph.js properties
+
+       // Set zotero type from OpenGraph type
+       if (metaData.openGraph['type'] && 
og.types[metaData.openGraph['type']]){ // if there is a type in the results and 
that type is defined in openGraph.js
+               citation.itemType = og.types[metaData.openGraph['type']];
+       }
+       else {
+               citation.itemType = 'webpage'; //default itemType
+       }
+
+       // Add universal (non type specific) OpenGraph properties
+       citation = exports.translate(citation, metaData.openGraph, og.general);
+
+       // Add type specific Open Graph properties
+       typeTranslator = og[citation.itemType];
+       if (typeTranslator){
+               citation = exports.translate(citation, metaData.openGraph, 
typeTranslator);
+       }
+
+       // Fall back on general metadata
+       citation  = exports.translate(citation, metaData.general, gen.general);
+
+       // Fall back methods
+
+       // Title
+       if (!citation.title){
+               citation.title = exports.getTitle(url, chtml);
+       }
+
+       // Access date - universal - format YYYY-MM-DD
+       citation.accessDate = (new Date()).toISOString().substring(0, 10);
+
+       // Fall back publication title - webpage only
+       if (!citation.publicationTitle && citation.itemType === 'webpage'){
+               var parsedUrl = urlParse.parse(url);
+               if (citation.title && parsedUrl && parsedUrl.hostname) {
+                       citation.publicationTitle = parsedUrl.hostname;
+               }
+       }
+       callback(citation);
+};
+
+/**
+ * Gets title in other ways if not metadata is available
+ * @param  {String} url   url
+ * @param  {Object} chtml Cheerio object with html loaded
+ * @return {String}       best title available for citation
+ **/
+
+exports.getTitle = function(url, chtml) {
+
+               var title;
+
+               // Try to get title from itemprop="heading" // Schema.org 
microdata
+               title = chtml('*[itemprop~="headline"]').first().text();
+               if (title) { return title; }
+
+               // Default
+               return url;
+       };
+
+/**
+ * Test methods in main
+ */
+
 if (require.main === module) {
-       var sampleUrl = 'http://example,.com';
-       console.log('scrape fcn running on sample url:'+sampleUrl);
-       scrape(sampleUrl, function(error, body){
-               console.log(error);
+       var fs = require('fs'),
+               sampleUrl = 
'http://blog.woorank.com/2013/04/dublin-core-metadata-for-seo-and-usability/';
+
+       exports(sampleUrl, function(error, responseCode, body){
+               console.log('scrape fcn running on sample url:'+sampleUrl);
                console.log(body);
        });
-}
 
-module.exports = {
-       scrape: scrape
-};
\ No newline at end of file
+       var $ = 
cheerio.load(fs.readFileSync('./node_modules/html-metadata/test_files/turtle_movie.html'));
+       exports.parseHTML('http://www.example.com', $, {}, function(results){
+               console.log('Parser running on test file');
+               console.log(results);
+       });
+}
diff --git a/lib/translators/README.md b/lib/translators/README.md
new file mode 100644
index 0000000..bd8a633
--- /dev/null
+++ b/lib/translators/README.md
@@ -0,0 +1,8 @@
+Citoid Translators
+=============
+
+Scrape.js uses these translators to convert between different types of 
embedded metadata and the internal Zotero format.
+
+For instance, Schema.org microdata has the type 
['WebPage'](http://schema.org/WebPage) where the title of the particlar page is 
termed 'headline'. In Zotero format (the internal format used by citoid), this 
would correspond to the type 'webpage' and the property 'title'.
+
+A translator is used to translate from the embedded metadata types and 
properties to the zotero types and properties to provide the most rich metadata 
possible.
\ No newline at end of file
diff --git a/lib/translators/general.js b/lib/translators/general.js
new file mode 100644
index 0000000..c4cf42f
--- /dev/null
+++ b/lib/translators/general.js
@@ -0,0 +1,49 @@
+#!/usr/bin/env node
+/**
+ * General field values : Zotero type field values
+ * @type {Object}
+ */
+
+exports.general = {
+               authorlink: null,
+               canonical: 'url',
+               description: 'abstract',
+               publisher: null,
+               robots: null,
+               shortlink: null,
+               title:  'title',
+};
+
+/**
+ * Converts the property 'author' to Zotero creator field
+ * @param  {String} authorText Text in author field
+ */
+
+exports.general.author = function(authorText, citation){
+       var creatorObj;
+       if (!authorText){
+               return citation;
+       }
+       if (!citation) {citation = {};}
+       authorText = authorText.trim().split(/\s/m);
+       creatorObj = {creatorType: 'author'};
+       creatorObj.creatorType = 'author';
+       if (authorText.length >= 1){
+               creatorObj.firstName = authorText[0];
+               creatorObj.lastName = "";
+       }
+       if (authorText.length >= 2 ){
+               creatorObj.lastName = authorText[authorText.length-1];
+       }
+       if (!citation.creators){
+               citation.creators = [];
+       }
+       citation.creators.push(creatorObj);
+       return citation;
+};
+
+
+/*Test methods in main */
+if (require.main === module) {
+       console.log(exports.general.author("Taylor Turtle"));
+}
diff --git a/lib/translators/openGraph.js b/lib/translators/openGraph.js
new file mode 100644
index 0000000..823ba80
--- /dev/null
+++ b/lib/translators/openGraph.js
@@ -0,0 +1,52 @@
+#!/usr/bin/env node
+/**
+ * Open graph type field values : Zotero type field values
+ * @type {Object}
+ */
+exports.types = {
+       website: 'webpage',
+       article:'blogPost', //or journalArticle, newspaperArticle, 
magazineArticle ?
+       book: 'book',
+       profile: 'webpage', //may be possible to obtain more information from 
this link a.k.a. names
+       'music.song': 'audioRecording',
+       'music.album': 'audioRecording',
+       'music.playlist': 'webpage',
+       'music.radiostation': 'webpage',
+       'video.movie': 'videoRecording',
+       'video.episode': 'videoRecording',
+       'video.tv_show': 'videoRecording',
+       'video.other' : 'videoRecording'
+};
+
+/**
+ * Open graph general properties : Zotero properties
+ * @type {Object}
+ */
+
+exports.general = {
+       title: 'title', // general OG property, common to all Zotero types
+       url:'url', // general OG property, common to all Zotero types
+       image: null, // general OG property, unused in any Zotero type //could 
possible put in archive location?
+       audio: null, // general OG property, unused in Zotero in any Zotero 
type //could possibly put in archive location?
+       description: 'abstract', // general OG property, abstract common to all 
Zotero types
+       locale: 'language', // general OG property, common to all Zotero types
+       determiner: null,  // general OG property, unused in any Zotero type
+       'locale:alternate': null, // general OG property, unused in any Zotero 
type
+       site_name: null, // general OG property, only used in webpage types - 
translate there
+       video: null // general OG property, unused in Zotero in any Zotero type 
//could possibly put in archive location?
+};
+
+/**
+ * Translator for Zotero type: webpage
+ * Open graph webpage properties : Zotero properties
+ * webpage has no specific properties other than what is defined in general og 
properties
+ * @type {Object}
+ */
+exports.webpage = {
+       site_name: 'publicationTitle' // prefix og: general property, but 
should only be assigned if type webpage is used
+};
+
+exports.videoRecording = {
+       duration: 'runningTime',
+       release_date: 'date'
+};
diff --git a/package.json b/package.json
index 6cbf985..fa0c739 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
        "name" : "citoid",
        "version" : "0.0.0",
-       "description" : "Converts search terms such as URL or title into 
citations.",
+       "description" : "Converts search terms such as URL or DOI into 
citations.",
        "scripts": {
                "test": "grunt test"
        },
@@ -12,10 +12,11 @@
                "bunyan" : "1.2.3",
                "cheerio" : "0.18.0",
                "express" : "4.10.4",
+               "html-metadata": "0.1.0",
                "path": "0.4.9",
-               "request" : "2.49.0",
-               "xmldom" : "0.1.19",
-               "xpath" : "0.0.7",
+               "request": "2.49.0",
+               "xmldom": "0.1.19",
+               "xpath": "0.0.7",
                "yargs": "1.3.3"
        },
        "devDependencies": {

-- 
To view, visit https://gerrit.wikimedia.org/r/182975
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I8d571305e853f41748ade494f52430a44e11bf75
Gerrit-PatchSet: 12
Gerrit-Project: mediawiki/services/citoid
Gerrit-Branch: master
Gerrit-Owner: Mvolz <mv...@wikimedia.org>
Gerrit-Reviewer: GWicke <gwi...@wikimedia.org>
Gerrit-Reviewer: Jforrester <jforres...@wikimedia.org>
Gerrit-Reviewer: Mobrovac <mobro...@wikimedia.org>
Gerrit-Reviewer: Mvolz <mv...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Use html-metadata library - change (mediawiki...citoid)

Reply via email to