Mvolz has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/182972

Change subject: [WIP] Open Graph support
......................................................................

[WIP] Open Graph support

Change-Id: Ibebc899ebd35e194ad32bb974a8c27b44a3427f5
---
M lib/distinguish.js
M lib/scrape.js
M package.json
M server.js
4 files changed, 61 insertions(+), 61 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/citoid refs/changes/72/182972/1

diff --git a/lib/distinguish.js b/lib/distinguish.js
index aaab664..9205013 100644
--- a/lib/distinguish.js
+++ b/lib/distinguish.js
@@ -10,8 +10,7 @@
 var request = require('request'),
        urlParse = require('url'),
        requestFromURL = require('./requests.js').requestFromURL,
-       requestFromDOI = require('./requests.js').requestFromDOI,
-       requestFromPubMedID = require('./requests.js').requestFromPubMedID;
+       requestFromDOI = require('./requests.js').requestFromDOI;
 
 /**
  * Determine type of string (doi, url) and callback on correct handler
@@ -19,40 +18,25 @@
  * @param  {Function} callback     callback(extractedValue, correctFunction)
  */
 var distinguish = function (searchString, callback){
-       var search, reDOI, rePMID, rePMCID, rePMCID2, parsedURL,
-               matchDOI, matchPMID, matchPMCID;
+       var search, match, reDOI, parsedURL;
 
        searchString.trim();
 
        search = searchString;
 
        reDOI = new RegExp('\\b10[.][0-9]{4,}[//].*\\b');
-       rePMID = new RegExp('\\b\\d{8}\\b');
-       rePMCID = new RegExp('\\bPMC\\d{7}\\b');
-       rePMCID2 = new RegExp('\\b\\d{7}\\b');
 
-       matchDOI = search.match(reDOI);
-       matchPMID = search.match(rePMID);
-       matchPMCID = search.match(rePMCID);
+       match = search.match(reDOI);
 
-       if (matchDOI) {
-               callback(matchDOI[0], requestFromDOI);
-       } else if (matchPMID) {
-               callback(matchPMID[0], requestFromPubMedID);
-       } else if (matchPMCID) {
-               callback(matchPMCID[0], requestFromPubMedID);
+       if (match){
+               callback(match[0], requestFromDOI);
        } else {
-               matchPMCID = search.match(rePMCID2);
-               if (matchPMCID) {
-                       callback('PMC' + matchPMCID[0], requestFromPubMedID);
+               parsedURL = urlParse.parse(search);
+               if (!parsedURL.protocol){
+                       search = 'http://'+ search;
+                       callback(search, requestFromURL);
                } else {
-                       parsedURL = urlParse.parse(search);
-                       if (!parsedURL.protocol){
-                               search = 'http://'+ search;
-                               callback(search, requestFromURL);
-                       } else {
-                               callback(search, requestFromURL); //assume url if not doi
-                       }
+                       callback(search, requestFromURL); //assume url if not doi
                }
        }
 };
@@ -68,4 +52,4 @@
 /* Exports */
 module.exports = {
        distinguish: distinguish
-};
+};
\ No newline at end of file
diff --git a/lib/scrape.js b/lib/scrape.js
index ae07062..5d40a2b 100644
--- a/lib/scrape.js
+++ b/lib/scrape.js
@@ -3,10 +3,17 @@
  * https://www.mediawiki.org/wiki/citoid
  */
 
-/* Import Modules */
+/*
+  Module dependencies
+*/
+
 var request = require('request'),
        urlParse = require('url'),
-       cheerio = require('cheerio');
+       cheerio = require('cheerio'),
+       og = require('open-graph'),
+       parseOG = require('open-graph').parse,
+       bunyan = require('bunyan'),
+       log = bunyan.createLogger({name: "citoid"});
 
 /**
  * Currently scrapes title only
@@ -38,37 +45,47 @@
        request(
                {
                        url: url,
-                       followAllRedirects: true
+                       followAllRedirects: true,
+                       headers: {'user-agent': 'Mozilla/5.0'},
                }, function(error, response, html){
+                       var ogData,
+                               citation = {itemType: 'webpage', url: url, title: url};
 
-                       var citation = {itemType: 'webpage', url: url, title: url};
-
-                       if (error || !response) {
-                               console.log(error);
+                       if (error){
                                callback(error, [citation]);
-                               return;
+                       } else {
+
+                               try {
+                                       $ = cheerio.load(html);
+                               } catch (e){
+                                       console.log('Could not load document: ' + e);
+                                       callback(null, [citation]);
+                                       return;
+                               }
+
+                               ogData = parseOG(html);
+                               console.log(ogData);
+
+                               //Title
+                               if (ogData.title){
+                                       citation.title = ogData.title;
+                               } else {
+                                       citation.title = getTitle();
+                               }
+
+                               // Access date - format YYYY-MM-DD
+                               citation.accessDate = (new Date()).toISOString().substring(0, 10);
+
+                               //Publication title
+                               var parsedUrl = urlParse.parse(url);
+
+                               if (ogData.site_name){
+                                       citation.publicationTitle = ogData.site_name;
+                               } else if (citation.title && parsedUrl && parsedUrl.hostname) {
+                                       citation.publicationTitle = parsedUrl.hostname;
+                               }
+                               callback(null, [citation]);
                        }
-
-                       try{
-                               $ = cheerio.load(html);
-                       }
-                       catch (e){
-                               console.log('Could not load document: ' + e);
-                               callback(error, [citation]);
-                       }
-
-                       citation.title = getTitle();
-
-                       // Access date on format YYYY-MM-DD
-                       citation.accessDate = (new Date()).toISOString().substring(0, 10);
-
-                       var parsedUrl = response.request.uri ? response.request.uri : urlParse.parse(url);
-
-                       if (citation.title && parsedUrl && parsedUrl.hostname) {
-                               citation.publicationTitle = parsedUrl.hostname;
-                       }
-
-                       callback(error, [citation]);
        });
 };
 
diff --git a/package.json b/package.json
index dfdd7e9..a35cd58 100644
--- a/package.json
+++ b/package.json
@@ -9,6 +9,7 @@
                "bunyan" : "1.2.3",
                "cheerio" : "0.18.0",
                "express" : "4.10.4",
+               "open-graph" : "git://github.com/mvolz/node-open-graph.git",
                "path": "0.4.9",
                "request" : "2.49.0",
                "xmldom" : "0.1.19",
diff --git a/server.js b/server.js
index 4af484e..41f5961 100644
--- a/server.js
+++ b/server.js
@@ -23,7 +23,7 @@
 var distinguish = require('./lib/distinguish.js').distinguish,
        requestFromURL = require('./lib/requests.js').requestFromURL;
 
-/* Import Local Settings */
+/* Import Local Settings*/
 var settingsFile = path.resolve(process.cwd(), argv.c),
        CitoidConfig = require(settingsFile).CitoidConfig,
        citoidPort = CitoidConfig.citoidPort,
@@ -37,11 +37,9 @@
 var zoteroURL = util.format('http://%s:%s/%s', zoteroInterface, zoteroPort.toString());
 
 // Init citoid webserver
-var citoid = express();
-var log = bunyan.createLogger({name: "citoid"});
+var citoid = express(),
+       log = bunyan.createLogger({name: "citoid"});
 
-// SECURITY WARNING: ALLOWS ALL REQUEST ORIGINS
-// change allowCORS in localsettings.js
 citoid.all('*', function(req, res, next) {
   res.header("Access-Control-Allow-Origin", allowCORS);
   res.header("Access-Control-Allow-Headers", "X-Requested-With, Content-Type");

-- 
To view, visit https://gerrit.wikimedia.org/r/182972
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibebc899ebd35e194ad32bb974a8c27b44a3427f5
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/citoid
Gerrit-Branch: master
Gerrit-Owner: Mvolz <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to