Hi all, 
I've created a small app that searches for Not Found [404] errors on 
a specified website. I use the node-scraper module 
(https://github.com/mape/node-scraper/), which uses Node's request 
module and jsdom for parsing the HTML. 
My app recursively searches for links on each web page and then calls 
the scraper for each link it finds. The problem is that after scanning 
100 pages (and collecting over 200 links still to be scanned), the RSS memory 
usage is >200MB (and it keeps increasing on each iteration). So after 
scanning 300-400 pages, I get a memory allocation error. 
The code is provided below. 
Any hints? 

var scraper = require('scraper'),
util = require('util');

// Crawl entry point: read the start URL from the command line, seed the work
// queue with it, and request the first page.
var startUrl = process.argv[2];
if (!startUrl) {
  // Without this guard a missing argument surfaces as an opaque TypeError.
  console.error("Usage: node <script> <start-url>");
  process.exit(1);
}

// Host the crawl is restricted to: the start URL without its scheme and without
// any path, query or fragment, so that a root-relative link ("/about") can be
// rebuilt as "http://" + checkDomain + link without doubling slashes or paths.
var checkDomain = startUrl.replace(/^https?:\/\//, "").replace(/[\/?#].*$/, ""),
    links = [startUrl],   // URLs discovered but not yet requested
    links_grabbed = [];   // URLs already handed to the scraper

// Take the first URL off the queue, mark it as requested, and fetch it;
// parseData continues the crawl from its callback.
var link_check = links.pop();
links_grabbed.push(link_check);
scraper(link_check, parseData);

// Maps each queued link to the page it was first found on, so that a failed
// request can be reported together with the page that linked to it. Created
// lazily inside parseData because this declaration sits below the first
// scraper() call in the file.
var link_sources;

/**
 * Scraper callback: prints progress, reports a failed page or harvests
 * same-site links from a loaded one, then requests the next queued URL
 * (or exits the process when the queue is empty).
 *
 * @param {?Object} err - Error passed by the scraper; its `uri` and
 *   `http_status` properties are read when it is set.
 * @param {Function} jQuery - jQuery-style selector function for the loaded page.
 * @param {*} url - Unused.
 */
function parseData(err, jQuery, url)
{
  link_sources = link_sources || Object.create(null);

  // Pages are requested one at a time, so the page this callback belongs to is
  // always the most recent entry in links_grabbed.
  var currentPage = links_grabbed[links_grabbed.length - 1];

  var ramUsage = bytesToSize(process.memoryUsage().rss);
  process.stdout.write("\rLinks checked: " +
    links_grabbed.length + "/" + links.length + " [" + ramUsage + "] ");

  if (err) {
    // Fall back to the page in flight if the error carries no `uri`.
    var failedUrl = err.uri || currentPage;
    console.log("%s [%s], source - %s", failedUrl, err.http_status,
      link_sources[failedUrl] || "(start URL)");
  }
  else {
    jQuery('a').each(function () {
      var href = jQuery(this).attr("href");
      if (typeof href !== "string") {
        return; // <a> without an href attribute (e.g. a named anchor)
      }

      var link = href.trim();

      // Root-relative link: rebuild it as an absolute URL on the crawled host.
      if (link.indexOf("/") === 0) {
        link = "http://" + checkDomain + link;
      }

      var isKnown = links.indexOf(link) !== -1 || links_grabbed.indexOf(link) !== -1;
      var isPlaceholder = ["#", ""].indexOf(link) !== -1;
      var isSameSite = link.indexOf("http://" + checkDomain) === 0 ||
                       link.indexOf("https://" + checkDomain) === 0;

      if (!isKnown && !isPlaceholder && isSameSite) {
        links.push(link);
        link_sources[link] = currentPage;
      }
    });
    // NOTE(review): the RSS growth reported for this crawler may come from each
    // page's DOM staying reachable after this callback returns. If the scraper
    // exposes the underlying jsdom window, closing it here is worth trying --
    // confirm against the scraper/jsdom documentation.
  }

  if (links.length > 0) {
    var link_check = links.pop();
    links_grabbed.push(link_check);
    scraper(link_check, parseData);
  }
  else {
    util.log("Scraping is done. Bye bye =)");
    process.exit(0);
  }
}

-- 
Job Board: http://jobs.nodejs.org/
Posting guidelines: 
https://github.com/joyent/node/wiki/Mailing-List-Posting-Guidelines
You received this message because you are subscribed to the Google
Groups "nodejs" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to
[email protected]
For more options, visit this group at
http://groups.google.com/group/nodejs?hl=en?hl=en

Reply via email to