I have completely refactored the code. I've removed node-scraper (it now uses
the request and jsdom modules directly), and I've added some parallelism to
the sending of HTTP requests.
After checking 6000 pages, the RSS amount is about ~370MB.
P.S. It is very strange that the 'window' object is not destroyed by V8's GC
after exiting the callback function in which 'window' is used.
var request = require('request'),
util = require('util'),
jsdom = require('jsdom'),
querystring = require('querystring'),
fs = require('fs');
// A target URL is required as the first CLI argument; abort otherwise.
if (process.argv[2] === undefined) {
    // util.error() is deprecated (removed in modern Node); console.error
    // writes to stderr the same way.
    console.error("ERROR: You need to pass a URL");
    process.exit(1);
}
// checkDomain: the bare host (scheme stripped) used to decide whether a
// discovered link belongs to the site being scanned.
var checkDomain = process.argv[2].replace("https://",
        "").replace("http://", ""),
    links = [process.argv[2]],  // queue of URLs still to fetch
    links_grabbed = {},         // URLs already fetched, used as a string-keyed set
                                // (was [] — an array abused as a map)
    link;                       // kept for compatibility with the loop below
util.log("Start scraping...");

// Poll the queue every 10 ms and fetch at most one URL per tick.
setInterval(function () {
    if (links.length === 0) return;

    // Must be a tick-local variable: the async callbacks below fire after
    // later ticks have overwritten a shared outer `link`, which made the
    // error log report the wrong URL.
    var link = links.pop();
    if (links_grabbed[link] === true) return;  // already fetched — skip
    links_grabbed[link] = true;

    request({
        uri: link
    }, function (err, response, body) {
        // When `err` is set, `response` is undefined — reading
        // response.statusCode here crashed the original.
        if (err) {
            console.log('Request error: %s [%s]', link, err.message);
            return;
        }
        if (response.statusCode !== 200) {
            console.log('Request error: %s [%s]', link, response.statusCode);
            return;
        }
        jsdom.env({
            html: body,
            scripts: ['http://code.jquery.com/jquery-1.6.min.js']
        }, function (err, window) {
            // The original ignored this error and dereferenced an
            // undefined `window` on malformed pages.
            if (err || !window) {
                console.log('jsdom error: %s', link);
                return;
            }
            try {
                window.jQuery("body").find("a").each(function (i, item) {
                    var lnk = window.jQuery(item).attr("href");
                    if (!lnk) return;  // <a> without href — indexOf would crash
                    if (lnk.indexOf("/") === 0)
                        lnk = "http://" + checkDomain + lnk;
                    var isInternal =
                        lnk.indexOf("http://" + checkDomain) === 0 ||
                        lnk.indexOf("https://" + checkDomain) === 0;
                    if (links.indexOf(lnk) === -1 &&
                        links_grabbed[lnk] !== true &&
                        lnk !== "#" && lnk !== "" && isInternal)
                        links.push(lnk);
                });
            } finally {
                // Always free the DOM, even if the traversal throws —
                // leaked windows are the memory problem in this thread.
                window.close();
            }
            // NOTE(review): bytesToSize() is not defined in this snippet —
            // assumed to be defined elsewhere in the file; verify.
            process.stdout.write("\r" + " Grabbed: " +
                Object.keys(links_grabbed).length +
                " / Need to check: " + links.length + " [" +
                bytesToSize(process.memoryUsage().rss) + "]");
        });
    });
}, 10);
On Monday, July 2, 2012 5:00:30 PM UTC+3, tim sebastian wrote:
>
> do you heavily rely on node-scraper? or can you use pure jsdom? Not sure
> where it leaks though, but didnt see much of a memory usage after closing
> the windows with jsdom.
>
> On Mon, Jul 2, 2012 at 3:51 PM, ec.developer <[email protected]>wrote:
>
>> Ahhh, brilliant! Thank you. window.close() - minimized significantly the
>> memory usage. But it still leaks. Before closing the window I was able to
>> check ~1000 pages. Now I can check over 10000 pages, but after a while I
>> got again the memory allocation error.
>>
>>
>> On Monday, July 2, 2012 4:32:14 PM UTC+3, tim sebastian wrote:
>>>
>>> https://github.com/tmpvar/**jsdom#how-it-works<https://github.com/tmpvar/jsdom#how-it-works>
>>>
>>> jsdom.env(html, function(`errors`, `window`) {
>>> // free memory associated with the window
>>> window.close();
>>> });
>>>
>>>
>>> On Mon, Jul 2, 2012 at 3:30 PM, tim sebastian <
>>> [email protected]> wrote:
>>>
>>>> node-scraper doesnt seem to be closing the jsdom window it creates.
>>>> And honestly dont see a way to do so expect you play around with the
>>>> node-scraper module yourself to fix this issue.
>>>>
>>>> Not even sure if that is the problem, but i had a similar issue working
>>>> with plain jsdom, and not closing the "window" that contains the whole
>>>> DOM-Tree was the reason.
>>>>
>>>> On Mon, Jul 2, 2012 at 3:08 PM, ec.developer <[email protected]>wrote:
>>>>
>>>>> Hi all,
>>>>> I've created a small app, which searches for Not Found [404]
>>>>> exceptions on a specified website. I use the node-scraper module (
>>>>> https://github.com/mape/node-**scraper/<https://github.com/mape/node-scraper/>),
>>>>>
>>>>> which uses native node's request module and jsdom for parsing the html).
>>>>> My app recursively searches for links on the each webpage, and then
>>>>> calls the Scraping stuff for each found link. The problem is that after
>>>>> scanning 100 pages (and collecting over 200 links to be scanned) the RSS
>>>>> memory usage is >200MB (and it still increases on each iteration). So
>>>>> after
>>>>> scanning over 300-400 pages, I got memory allocation error.
>>>>> The code is provided below.
>>>>> Any hints?
>>>>>
>>>>> var scraper = require('scraper'),
>>>>> util = require('util');
>>>>>
>>>>> var checkDomain = process.argv[2].replace("**https://",
>>>>> "").replace("http://", ""),
>>>>> links = [process.argv[2]],
>>>>> links_grabbed = [];
>>>>>
>>>>> var link_check = links.pop();
>>>>> links_grabbed.push(link_check)**;
>>>>> scraper(link_check, parseData);
>>>>>
>>>>> function parseData(err, jQuery, url)
>>>>> {
>>>>> var ramUsage = bytesToSize(process.**memoryUsage().rss);
>>>>> process.stdout.write("\rLinks checked: " + (Object.keys(links_grabbed).
>>>>> **length) + "/" + links.length + " ["+ ramUsage +"] ");
>>>>>
>>>>> if( err ) {
>>>>> console.log("%s [%s], source - %s", err.uri, err.http_status,
>>>>> links_grabbed[err.uri].src);
>>>>> }
>>>>> else {
>>>>> jQuery('a').each(function() {
>>>>> var link = jQuery(this).attr("href").**trim();
>>>>>
>>>>> if( link.indexOf("/")==0 )
>>>>> link = "http://" + checkDomain + link;
>>>>>
>>>>> if( links.indexOf(link)==-1 && links_grabbed.indexOf(link)==-**1 &&
>>>>> ["#", ""].indexOf(link)==-1 && (link.indexOf("http://" + checkDomain)==0
>>>>> ||
>>>>> link.indexOf("https://"+**checkDomain)==0) )
>>>>> links.push(link);
>>>>> });
>>>>> }
>>>>>
>>>>> if( links.length>0 ) {
>>>>> var link_check = links.pop();
>>>>> links_grabbed.push(link_check)**;
>>>>> scraper(link_check, parseData);
>>>>> }
>>>>> else {
>>>>> util.log("Scraping is done. Bye bye =)");
>>>>> process.exit(0);
>>>>> }
>>>>> }
>>>>>
>>>>> --
>>>>> Job Board: http://jobs.nodejs.org/
>>>>> Posting guidelines: https://github.com/joyent/**
>>>>> node/wiki/Mailing-List-**Posting-Guidelines<https://github.com/joyent/node/wiki/Mailing-List-Posting-Guidelines>
>>>>> You received this message because you are subscribed to the Google
>>>>> Groups "nodejs" group.
>>>>> To post to this group, send email to [email protected]
>>>>> To unsubscribe from this group, send email to
>>>>> nodejs+unsubscribe@**googlegroups.com<nodejs%[email protected]>
>>>>> For more options, visit this group at
>>>>> http://groups.google.com/**group/nodejs?hl=en?hl=en<http://groups.google.com/group/nodejs?hl=en?hl=en>
>>>>>
>>>>
>>>>
>>> --
>> Job Board: http://jobs.nodejs.org/
>> Posting guidelines:
>> https://github.com/joyent/node/wiki/Mailing-List-Posting-Guidelines
>> You received this message because you are subscribed to the Google
>> Groups "nodejs" group.
>> To post to this group, send email to [email protected]
>> To unsubscribe from this group, send email to
>> [email protected]
>> For more options, visit this group at
>> http://groups.google.com/group/nodejs?hl=en?hl=en
>>
>
>
--
Job Board: http://jobs.nodejs.org/
Posting guidelines:
https://github.com/joyent/node/wiki/Mailing-List-Posting-Guidelines
You received this message because you are subscribed to the Google
Groups "nodejs" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to
[email protected]
For more options, visit this group at
http://groups.google.com/group/nodejs?hl=en?hl=en