#!/usr/bin/perl

# Extractor for the "keywords" meta tag of web pages

use strict;
use HTML::TreeBuilder 2.97;
use LWP::UserAgent;
use Mysql;

# get_headlines($url)
#
# Fetches $url with an HTTP GET (45 second timeout), parses the body as HTML
# and returns the comma-separated entries of the "content" attribute of the
# first <meta> element accepted by find().
#
# Returns:
#   - the empty list when the request did not succeed (a timestamped warning
#     is written to STDERR);
#   - a one-element list containing '' when the page was fetched but yields
#     no keyword at all (no matching <meta>, or a missing/empty "content");
#   - otherwise the list of keywords, split on ','.
#
# Dies with "What URL?" when called without a true URL argument.
sub get_headlines {
    my $url = $_[0] || die "What URL?";

    my $ua = LWP::UserAgent->new;
    $ua->timeout(45);
    my $response = $ua->request(HTTP::Request->new(GET => $url));
    unless ($response->is_success) {
        warn "[",scalar(localtime()),"] : Couldn't get $url: ", $response->status_line, "\n";
        return;
    }

    my $tree = HTML::TreeBuilder->new();
    $tree->parse($response->content);
    $tree->eof;

    # Read the attribute before the tree is torn down by delete().
    my $meta    = $tree->look_down('_tag', 'meta', \&find);
    my $content = $meta ? $meta->attr('content') : undef;
    $tree->delete;

    my @ret = defined($content) ? split(',', $content) : ();

    # split() yields an empty list for an empty string; without this guard an
    # empty keywords tag would be indistinguishable from a failed fetch (which
    # is signalled to the caller by an empty list).
    @ret = ('') unless @ret;

    return @ret;
}

# find($element)
#
# Filter callback handed to look_down() by get_headlines(). Returns 1 when
# the "name" attribute of $element is "keywords" (compared
# case-insensitively, since pages commonly write "Keywords" or "KEYWORDS"),
# and nothing (empty list / undef) otherwise, including when the element has
# no "name" attribute.
sub find {
    my $name = $_[0]->attr('name');
    return unless defined($name) && lc($name) eq 'keywords';
    return 1;
}

######### MAIN
# For every site that has no row in `meta` and whose si_lst_meta_run is either
# NULL or more than 7 days old, fetch http://<si_url>, extract its keywords
# with get_headlines() and insert one `meta` row per keyword.  Sites for which
# get_headlines() returns nothing are updated with si_op='N' and
# si_lst_meta_run=curdate() instead.
#
# Usage: <script> [LIMIT]
#   LIMIT  optional maximum number of sites to process (non-negative integer).
$|=1; # Enable autoflush so progress output is not held back in a buffer
my $host="";
my $database="mydb";
my $user="myuser";
my $password='mypasswd';

# LIMIT is spliced into the SQL text below, so refuse anything that is not a
# plain integer rather than passing arbitrary command-line text to the server.
my $limit=$ARGV[0];
if (defined($limit) && $limit !~ /\A\d+\z/) {
    die "LIMIT must be a non-negative integer, got '$limit'\n";
}

print "Start [",scalar(localtime()),"]\n";
my $stime=time();
my $dbh_mysql=Mysql->Connect($host,$database,$user,$password) or die ("probleme de connexion");
my $url_query = "select si_numero,si_url from site left join meta on site.si_numero=meta.me_site where me_site is null 
 and (-to_days(si_lst_meta_run)+to_days(now())>7 or si_lst_meta_run is null)";
if (defined($limit)) {
    $url_query.=" limit $limit";
}
my $result = $dbh_mysql->Query($url_query) or die("cht'i probleme de connexion");
my $num_table=$result->numrows;

for my $cpt (1 .. $num_table) {
    my ($site_id, $site_url) = $result->fetchrow;
    print "Site: $cpt/$num_table (Nø $site_id) keyword=>";

    my @links = get_headlines("http://$site_url");

    # An empty list means the page could not be fetched: flag the site and
    # move on.  (Testing the array itself replaces defined(@array), which
    # modern perls reject at compile time.)
    unless (@links) {
        my $upd="update site set si_op='N',si_lst_meta_run=curdate() where si_numero=$site_id";
        defined($dbh_mysql->Query($upd))
            or warn "[",scalar(localtime()),"] : query failed: $upd\n";
        print "\n";
        next;
    }
    print scalar(@links),"\n";

    # One row per keyword, numbered from 1.  An empty keyword (the "nothing
    # found" marker from get_headlines) is stored as SQL NULL.
    my $i=1;
    for my $val (@links) {
        # Keywords come from remote HTML: let the database handle produce the
        # quoted SQL literal instead of regex-escaping with quotemeta().
        my $sql_val = $val eq '' ? 'null' : $dbh_mysql->quote($val);
        my $ins = "insert meta values($site_id,$i,$sql_val)";
        defined($dbh_mysql->Query($ins))
            or warn "[",scalar(localtime()),"] : query failed: $ins\n";
        $i++;
    }
}
my $etime=time();
print "Fin [",scalar(localtime()),"]: soit ",$etime-$stime," sec d'execution\n";
