Mvolz has uploaded a new change for review.
https://gerrit.wikimedia.org/r/182972
Change subject: [WIP] Open Graph support
......................................................................
[WIP] Open Graph support
Change-Id: Ibebc899ebd35e194ad32bb974a8c27b44a3427f5
---
M lib/distinguish.js
M lib/scrape.js
M package.json
M server.js
4 files changed, 61 insertions(+), 61 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/citoid refs/changes/72/182972/1
diff --git a/lib/distinguish.js b/lib/distinguish.js
index aaab664..9205013 100644
--- a/lib/distinguish.js
+++ b/lib/distinguish.js
@@ -10,8 +10,7 @@
var request = require('request'),
urlParse = require('url'),
requestFromURL = require('./requests.js').requestFromURL,
- requestFromDOI = require('./requests.js').requestFromDOI,
- requestFromPubMedID = require('./requests.js').requestFromPubMedID;
+ requestFromDOI = require('./requests.js').requestFromDOI;
/**
* Determine type of string (doi, url) and callback on correct handler
@@ -19,40 +18,25 @@
* @param {Function} callback callback(extractedValue, correctFunction)
*/
var distinguish = function (searchString, callback){
- var search, reDOI, rePMID, rePMCID, rePMCID2, parsedURL,
- matchDOI, matchPMID, matchPMCID;
+ var search, match, reDOI, parsedURL;
searchString.trim();
search = searchString;
reDOI = new RegExp('\\b10[.][0-9]{4,}[//].*\\b');
- rePMID = new RegExp('\\b\\d{8}\\b');
- rePMCID = new RegExp('\\bPMC\\d{7}\\b');
- rePMCID2 = new RegExp('\\b\\d{7}\\b');
- matchDOI = search.match(reDOI);
- matchPMID = search.match(rePMID);
- matchPMCID = search.match(rePMCID);
+ match = search.match(reDOI);
- if (matchDOI) {
- callback(matchDOI[0], requestFromDOI);
- } else if (matchPMID) {
- callback(matchPMID[0], requestFromPubMedID);
- } else if (matchPMCID) {
- callback(matchPMCID[0], requestFromPubMedID);
+ if (match){
+ callback(match[0], requestFromDOI);
} else {
- matchPMCID = search.match(rePMCID2);
- if (matchPMCID) {
- callback('PMC' + matchPMCID[0], requestFromPubMedID);
+ parsedURL = urlParse.parse(search);
+ if (!parsedURL.protocol){
+ search = 'http://'+ search;
+ callback(search, requestFromURL);
} else {
- parsedURL = urlParse.parse(search);
- if (!parsedURL.protocol){
- search = 'http://'+ search;
- callback(search, requestFromURL);
- } else {
- callback(search, requestFromURL); //assume url if not doi
- }
+ callback(search, requestFromURL); //assume url if not doi
}
}
};
@@ -68,4 +52,4 @@
/* Exports */
module.exports = {
distinguish: distinguish
-};
+};
\ No newline at end of file
diff --git a/lib/scrape.js b/lib/scrape.js
index ae07062..5d40a2b 100644
--- a/lib/scrape.js
+++ b/lib/scrape.js
@@ -3,10 +3,17 @@
* https://www.mediawiki.org/wiki/citoid
*/
-/* Import Modules */
+/*
+ Module dependencies
+*/
+
var request = require('request'),
urlParse = require('url'),
- cheerio = require('cheerio');
+ cheerio = require('cheerio'),
+ og = require('open-graph'),
+ parseOG = require('open-graph').parse,
+ bunyan = require('bunyan'),
+ log = bunyan.createLogger({name: "citoid"});
/**
* Currently scrapes title only
@@ -38,37 +45,47 @@
request(
{
url: url,
- followAllRedirects: true
+ followAllRedirects: true,
+ headers: {'user-agent': 'Mozilla/5.0'},
}, function(error, response, html){
+ var ogData,
+ citation = {itemType: 'webpage', url: url, title: url};
- var citation = {itemType: 'webpage', url: url, title: url};
-
- if (error || !response) {
- console.log(error);
+ if (error){
callback(error, [citation]);
- return;
+ } else {
+
+ try {
+ $ = cheerio.load(html);
+ } catch (e){
+ console.log('Could not load document: ' + e);
+ callback(null, [citation]);
+ return;
+ }
+
+ ogData = parseOG(html);
+ console.log(ogData);
+
+ //Title
+ if (ogData.title){
+ citation.title = ogData.title;
+ } else {
+ citation.title = getTitle();
+ }
+
+ // Access date - format YYYY-MM-DD
+ citation.accessDate = (new Date()).toISOString().substring(0, 10);
+
+ //Publication title
+ var parsedUrl = urlParse.parse(url);
+
+ if (ogData.site_name){
+ citation.publicationTitle = ogData.site_name;
+ } else if (citation.title && parsedUrl && parsedUrl.hostname) {
+ citation.publicationTitle = parsedUrl.hostname;
+ }
+ callback(null, [citation]);
}
-
- try{
- $ = cheerio.load(html);
- }
- catch (e){
- console.log('Could not load document: ' + e);
- callback(error, [citation]);
- }
-
- citation.title = getTitle();
-
- // Access date on format YYYY-MM-DD
- citation.accessDate = (new Date()).toISOString().substring(0, 10);
-
- var parsedUrl = response.request.uri ? response.request.uri : urlParse.parse(url);
-
- if (citation.title && parsedUrl && parsedUrl.hostname) {
- citation.publicationTitle = parsedUrl.hostname;
- }
-
- callback(error, [citation]);
});
};
diff --git a/package.json b/package.json
index dfdd7e9..a35cd58 100644
--- a/package.json
+++ b/package.json
@@ -9,6 +9,7 @@
"bunyan" : "1.2.3",
"cheerio" : "0.18.0",
"express" : "4.10.4",
+ "open-graph" : "git://github.com/mvolz/node-open-graph.git",
"path": "0.4.9",
"request" : "2.49.0",
"xmldom" : "0.1.19",
diff --git a/server.js b/server.js
index 4af484e..41f5961 100644
--- a/server.js
+++ b/server.js
@@ -23,7 +23,7 @@
var distinguish = require('./lib/distinguish.js').distinguish,
requestFromURL = require('./lib/requests.js').requestFromURL;
-/* Import Local Settings */
+/* Import Local Settings*/
var settingsFile = path.resolve(process.cwd(), argv.c),
CitoidConfig = require(settingsFile).CitoidConfig,
citoidPort = CitoidConfig.citoidPort,
@@ -37,11 +37,9 @@
var zoteroURL = util.format('http://%s:%s/%s', zoteroInterface,
zoteroPort.toString());
// Init citoid webserver
-var citoid = express();
-var log = bunyan.createLogger({name: "citoid"});
+var citoid = express(),
+ log = bunyan.createLogger({name: "citoid"});
-// SECURITY WARNING: ALLOWS ALL REQUEST ORIGINS
-// change allowCORS in localsettings.js
citoid.all('*', function(req, res, next) {
res.header("Access-Control-Allow-Origin", allowCORS);
res.header("Access-Control-Allow-Headers", "X-Requested-With, Content-Type");
--
To view, visit https://gerrit.wikimedia.org/r/182972
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibebc899ebd35e194ad32bb974a8c27b44a3427f5
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/citoid
Gerrit-Branch: master
Gerrit-Owner: Mvolz <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits