Mvolz has uploaded a new change for review. https://gerrit.wikimedia.org/r/182975
Change subject: [WIP] Open graph support ...................................................................... [WIP] Open graph support Change-Id: I8d571305e853f41748ade494f52430a44e11bf75 --- M lib/scrape.js M package.json 2 files changed, 47 insertions(+), 29 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/citoid refs/changes/75/182975/1 diff --git a/lib/scrape.js b/lib/scrape.js index ae07062..5d40a2b 100644 --- a/lib/scrape.js +++ b/lib/scrape.js @@ -3,10 +3,17 @@ * https://www.mediawiki.org/wiki/citoid */ -/* Import Modules */ +/* + Module dependencies +*/ + var request = require('request'), urlParse = require('url'), - cheerio = require('cheerio'); + cheerio = require('cheerio'), + og = require('open-graph'), + parseOG = require('open-graph').parse, + bunyan = require('bunyan'), + log = bunyan.createLogger({name: "citoid"}); /** * Currently scrapes title only @@ -38,37 +45,47 @@ request( { url: url, - followAllRedirects: true + followAllRedirects: true, + headers: {'user-agent': 'Mozilla/5.0'}, }, function(error, response, html){ + var ogData, + citation = {itemType: 'webpage', url: url, title: url}; - var citation = {itemType: 'webpage', url: url, title: url}; - - if (error || !response) { - console.log(error); + if (error){ callback(error, [citation]); - return; + } else { + + try { + $ = cheerio.load(html); + } catch (e){ + console.log('Could not load document: ' + e); + callback(null, [citation]); + return; + } + + ogData = parseOG(html); + console.log(ogData); + + //Title + if (ogData.title){ + citation.title = ogData.title; + } else { + citation.title = getTitle(); + } + + // Access date - format YYYY-MM-DD + citation.accessDate = (new Date()).toISOString().substring(0, 10); + + //Publication title + var parsedUrl = urlParse.parse(url); + + if (ogData.site_name){ + citation.publicationTitle = ogData.site_name; + } else if (citation.title && parsedUrl && parsedUrl.hostname) { + citation.publicationTitle = parsedUrl.hostname; + } + callback(null, [citation]); } - - try{ - $ = cheerio.load(html); - } - catch (e){ - console.log('Could not load document: ' + e); - callback(error, [citation]); - } - - citation.title = getTitle(); - - // Access date on format YYYY-MM-DD - citation.accessDate = (new Date()).toISOString().substring(0, 10); - - var parsedUrl = response.request.uri ? response.request.uri : urlParse.parse(url); - - if (citation.title && parsedUrl && parsedUrl.hostname) { - citation.publicationTitle = parsedUrl.hostname; - } - - callback(error, [citation]); }); }; diff --git a/package.json b/package.json index dfdd7e9..a35cd58 100644 --- a/package.json +++ b/package.json @@ -9,6 +9,7 @@ "bunyan" : "1.2.3", "cheerio" : "0.18.0", "express" : "4.10.4", + "open-graph" : "git://github.com/mvolz/node-open-graph.git", "path": "0.4.9", "request" : "2.49.0", "xmldom" : "0.1.19", -- To view, visit https://gerrit.wikimedia.org/r/182975 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I8d571305e853f41748ade494f52430a44e11bf75 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/citoid Gerrit-Branch: master Gerrit-Owner: Mvolz <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
