I have completely refactored the code. I've removed node-scraper (it now uses
the request and jsdom modules directly), and I've added some parallelism to
the sending of HTTP requests.
After checking 6000 pages, the RSS amount is about ~370MB.
P.S. It is very strange that the 'window' object is not destroyed by V8's GC
after exiting the callback function in which 'window' is used.
var request = require('request'),
util = require('util'),
jsdom = require('jsdom'),
querystring = require('querystring'),
fs = require('fs');
// A target URL is required as the first CLI argument; abort otherwise.
if (process.argv[2] === undefined) {
    // util.error() is deprecated (removed in modern Node); console.error
    // writes to stderr the same way.
    console.error("ERROR: You need to pass a URL");
    process.exit(1);
}
// checkDomain: the bare host (scheme stripped) used to decide whether a
// discovered link belongs to the site being scanned.
var checkDomain = process.argv[2].replace("https://",
        "").replace("http://", ""),
    links = [process.argv[2]],  // queue of URLs still to fetch
    links_grabbed = {},         // URLs already fetched, used as a string-keyed set
                                // (was [] — an array abused as a map)
    link;                       // kept for compatibility with the loop below
util.log("Start scraping...");

// Poll the queue every 10 ms and fetch at most one URL per tick.
setInterval(function () {
    if (links.length === 0) return;

    // Must be a tick-local variable: the async callbacks below fire after
    // later ticks have overwritten a shared outer `link`, which made the
    // error log report the wrong URL.
    var link = links.pop();
    if (links_grabbed[link] === true) return;  // already fetched — skip
    links_grabbed[link] = true;

    request({
        uri: link
    }, function (err, response, body) {
        // When `err` is set, `response` is undefined — reading
        // response.statusCode here crashed the original.
        if (err) {
            console.log('Request error: %s [%s]', link, err.message);
            return;
        }
        if (response.statusCode !== 200) {
            console.log('Request error: %s [%s]', link, response.statusCode);
            return;
        }
        jsdom.env({
            html: body,
            scripts: ['http://code.jquery.com/jquery-1.6.min.js']
        }, function (err, window) {
            // The original ignored this error and dereferenced an
            // undefined `window` on malformed pages.
            if (err || !window) {
                console.log('jsdom error: %s', link);
                return;
            }
            try {
                window.jQuery("body").find("a").each(function (i, item) {
                    var lnk = window.jQuery(item).attr("href");
                    if (!lnk) return;  // <a> without href — indexOf would crash
                    if (lnk.indexOf("/") === 0)
                        lnk = "http://" + checkDomain + lnk;
                    var isInternal =
                        lnk.indexOf("http://" + checkDomain) === 0 ||
                        lnk.indexOf("https://" + checkDomain) === 0;
                    if (links.indexOf(lnk) === -1 &&
                        links_grabbed[lnk] !== true &&
                        lnk !== "#" && lnk !== "" && isInternal)
                        links.push(lnk);
                });
            } finally {
                // Always free the DOM, even if the traversal throws —
                // leaked windows are the memory problem in this thread.
                window.close();
            }
            // NOTE(review): bytesToSize() is not defined in this snippet —
            // assumed to be defined elsewhere in the file; verify.
            process.stdout.write("\r" + " Grabbed: " +
                Object.keys(links_grabbed).length +
                " / Need to check: " + links.length + " [" +
                bytesToSize(process.memoryUsage().rss) + "]");
        });
    });
}, 10);
On Monday, July 2, 2012 5:00:30 PM UTC+3, tim sebastian wrote:
>
> do you heavily rely on node-scraper? or can you use pure jsdom? Not sure
> where it leaks though, but didnt see much of a memory usage after closing
> the windows with jsdom.
>
> On Mon, Jul 2, 2012 at 3:51 PM, ec.developer <[email protected]>wrote:
>
>> Ahhh, brilliant! Thank you. window.close() - minimized significantly the
>> memory usage. But it still leaks. Before closing the window I was able to
>> check ~1000 pages. Now I can check over 10000 pages, but after a while I
>> got again the memory allocation error.
>>
>>
>> On Monday, July 2, 2012 4:32:14 PM UTC+3, tim sebastian wrote:
>>>
>>> https://github.com/tmpvar/**jsdom#how-it-works<https://github.com/tmpvar/jsdom#how-it-works>
>>>
>>> jsdom.env(html, function(`errors`, `window`) {
>>> // free memory associated with the window
>>> window.close();
>>> });
>>>
>>>
>>> On Mon, Jul 2, 2012 at 3:30 PM, tim sebastian <
>>> [email protected]> wrote:
>>>
>>>> node-scraper doesnt seem to be closing the jsdom window it creates.
>>>> And honestly dont see a way to do so expect you play around with the
>>>> node-scraper module yourself to fix this issue.
>>>>
>>>> Not even sure if that is the problem, but i had a similar issue working
>>>> with plain jsdom, and not closing the "window" that contains the whole
>>>> DOM-Tree was the reason.
>>>>
>>>> On Mon, Jul 2, 2012 at 3:08 PM, ec.developer <[email protected]>wrote:
>>>>
>>>>> Hi all,
>>>>> I've created a small app, which searches for Not Found [404]
>>>>> exceptions on a specified website. I use the node-scraper module (
>>>>> https://github.com/mape/node-**scraper/<https://github.com/mape/node-scraper/>),
>>>>>
>>>>> which uses native node's request module and jsdom for parsing the html).
>>>>> My app recursively searches for links on the each webpage, and then
>>>>> calls the Scraping stuff for each found link. The problem is that after
>>>>> scanning 100 pages (and collecting over 200 links to be scanned) the RSS
>>>>> memory usage is >200MB (and it still increases on each iteration). So
>>>>> after
>>>>> scanning over 300-400 pages, I got memory allocation error.
>>>>> The code is provided below.
>>>>> Any hints?
>>>>>
>>>>> var scraper = require('scraper'),
>>>>> util = require('util');
>>>>>
>>>>> var checkDomain = process.argv[2].replace("**https://",
>>>>> "").replace("http://", ""),
>>>>> links = [process.argv[2]],
>>>>> links_grabbed = [];
>>>>>
>>>>> var link_check = links.pop();
>>>>> links_grabbed.push(link_check)**;
>>>>> scraper(link_check, parseData);
>>>>>
>>>>> function parseData(err, jQuery, url)
>>>>> {
>>>>> var ramUsage = bytesToSize(process.**memoryUsage().rss);
>>>>> process.stdout.write("\rLinks checked: " + (Object.keys(links_grabbed).
>>>>> **length) + "/" + links.length + " ["+ ramUsage +"] ");
>>>>>
>>>>> if( err ) {
>>>>> console.log("%s [%s], source - %s", err.uri, err.http_status,
>>>>> links_grabbed[err.uri].src);
>>>>> }
>>>>> else {
>>>>> jQuery('a').each(function() {
>>>>> var link = jQuery(this).attr("href").**trim();
>>>>>
>>>>> if( link.indexOf("/")==0 )
>>>>> link = "http://" + checkDomain + link;
>>>>>
>>>>> if( links.indexOf(link)==-1 && links_grabbed.indexOf(link)==-**1 &&
>>>>> ["#", ""].indexOf(link)==-1 && (link.indexOf("http://" + checkDomain)==0
>>>>> ||
>>>>> link.indexOf("https://"+**checkDomain)==0) )
>>>>> links.push(link);
>>>>> });
>>>>> }
>>>>>
>>>>> if( links.length>0 ) {
>>>>> var link_check = links.pop();
>>>>> links_grabbed.push(link_check)**;
>>>>> scraper(link_check, parseData);
>>>>> }
>>>>> else {
>>>>> util.log("Scraping is done. Bye bye =)");
>>>>> process.exit(0);
>>>>> }
>>>>> }
>>>>>
>>>>> --
>>>>> Job Board: http://jobs.nodejs.org/
>>>>> Posting guidelines: https://github.com/joyent/**
>>>>> node/wiki/Mailing-List-**Posting-Guidelines<https://github.com/joyent/node/wiki/Mailing-List-Posting-Guidelines>
>>>>> You received this message because you are subscribed to the Google
>>>>> Groups "nodejs" group.
>>>>> To post to this group, send email to [email protected]
>>>>> To unsubscribe from this group, send email to
>>>>> nodejs+unsubscribe@**googlegroups.com<nodejs%[email protected]>
>>>>> For more options, visit this group at
>>>>> http://groups.google.com/**group/nodejs?hl=en?hl=en<http://groups.google.com/group/nodejs?hl=en?hl=en>
>>>>>
>>>>
>>>>
>>> --
>> Job Board: http://jobs.nodejs.org/
>> Posting guidelines:
>> https://github.com/joyent/node/wiki/Mailing-List-Posting-Guidelines
>> You received this message because you are subscribed to the Google
>> Groups "nodejs" group.
>> To post to this group, send email to [email protected]
>> To unsubscribe from this group, send email to
>> [email protected]
>> For more options, visit this group at
>> http://groups.google.com/group/nodejs?hl=en?hl=en
>>
>
>
--
Job Board: http://jobs.nodejs.org/
Posting guidelines:
https://github.com/joyent/node/wiki/Mailing-List-Posting-Guidelines
You received this message because you are subscribed to the Google
Groups "nodejs" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to
[email protected]
For more options, visit this group at
http://groups.google.com/group/nodejs?hl=en?hl=en