Mvolz has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/182975

Change subject: [WIP] Open graph support
......................................................................

[WIP] Open graph support

Change-Id: I8d571305e853f41748ade494f52430a44e11bf75
---
M lib/scrape.js
M package.json
2 files changed, 47 insertions(+), 29 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/citoid refs/changes/75/182975/1

diff --git a/lib/scrape.js b/lib/scrape.js
index ae07062..5d40a2b 100644
--- a/lib/scrape.js
+++ b/lib/scrape.js
@@ -3,10 +3,17 @@
  * https://www.mediawiki.org/wiki/citoid
  */
 
-/* Import Modules */
+/*
+  Module dependencies
+*/
+
 var request = require('request'),
        urlParse = require('url'),
-       cheerio = require('cheerio');
+       cheerio = require('cheerio'),
+       og = require('open-graph'),
+       parseOG = require('open-graph').parse,
+       bunyan = require('bunyan'),
+       log = bunyan.createLogger({name: "citoid"});
 
 /**
  * Currently scrapes title only
@@ -38,37 +45,47 @@
        request(
                {
                        url: url,
-                       followAllRedirects: true
+                       followAllRedirects: true,
+                       headers: {'user-agent': 'Mozilla/5.0'},
                }, function(error, response, html){
+                       var ogData,
+                               citation = {itemType: 'webpage', url: url, title: url};
 
-                       var citation = {itemType: 'webpage', url: url, title: url};
-
-                       if (error || !response) {
-                               console.log(error);
+                       if (error){
                                callback(error, [citation]);
-                               return;
+                       } else {
+
+                               try {
+                                       $ = cheerio.load(html);
+                               } catch (e){
+                                       console.log('Could not load document: ' + e);
+                                       callback(null, [citation]);
+                                       return;
+                               }
+
+                               ogData = parseOG(html);
+                               console.log(ogData);
+
+                               //Title
+                               if (ogData.title){
+                                       citation.title = ogData.title;
+                               } else {
+                                       citation.title = getTitle();
+                               }
+
+                               // Access date - format YYYY-MM-DD
+                               citation.accessDate = (new Date()).toISOString().substring(0, 10);
+
+                               //Publication title
+                               var parsedUrl = urlParse.parse(url);
+
+                               if (ogData.site_name){
+                                       citation.publicationTitle = ogData.site_name;
+                               } else if (citation.title && parsedUrl && parsedUrl.hostname) {
+                                       citation.publicationTitle = parsedUrl.hostname;
+                               }
+                               callback(null, [citation]);
                        }
-
-                       try{
-                               $ = cheerio.load(html);
-                       }
-                       catch (e){
-                               console.log('Could not load document: ' + e);
-                               callback(error, [citation]);
-                       }
-
-                       citation.title = getTitle();
-
-                       // Access date on format YYYY-MM-DD
-                       citation.accessDate = (new Date()).toISOString().substring(0, 10);
-
-                       var parsedUrl = response.request.uri ? response.request.uri : urlParse.parse(url);
-
-                       if (citation.title && parsedUrl && parsedUrl.hostname) {
-                               citation.publicationTitle = parsedUrl.hostname;
-                       }
-
-                       callback(error, [citation]);
        });
 };
 
diff --git a/package.json b/package.json
index dfdd7e9..a35cd58 100644
--- a/package.json
+++ b/package.json
@@ -9,6 +9,7 @@
                "bunyan" : "1.2.3",
                "cheerio" : "0.18.0",
                "express" : "4.10.4",
+               "open-graph" : "git://github.com/mvolz/node-open-graph.git",
                "path": "0.4.9",
                "request" : "2.49.0",
                "xmldom" : "0.1.19",

-- 
To view, visit https://gerrit.wikimedia.org/r/182975
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I8d571305e853f41748ade494f52430a44e11bf75
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/citoid
Gerrit-Branch: master
Gerrit-Owner: Mvolz <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to