#!/usr/local/ActivePerl-5.10/bin/perl #auto download pictures from http://e.psbbs.info/ #by [email protected] use strict; use warnings;
use threads; use WWW::Mechanize; use POSIX qw(strftime); $| = 1; my $today = strftime "%Y%m%d", localtime; mkdir $today if(not -d $today); my $pager = $ARGV[0] ? $ARGV[0] : 1; #页码,默认为1 my $url = "http://rr.pstang.com/thread.php?fid=39&search=&page=$pager"; my $mech = WWW::Mechanize->new(quite => 1, onerror => undef); my @mechs; #创建4个实例用于多线程下载图片 foreach (0..3) { my $m = WWW::Mechanize->new(quite => 1, onerror => undef, timeout => 180); $m->agent_alias('Windows IE 6'); $m->stack_depth(2); push @mechs, $m; } $mech->agent_alias('Windows IE 6'); $mech->stack_depth(2); print getts(), ", get baseURL '$url' ... "; $mech->get($url); print "done\n"; my $content = $mech->content(); $content =~ s/\n/ /g; $content =~ s/\s{2,}/ /g; $content = $1 if($content =~ m{普通主题(.*)$}); #取出'普通主题'关键字之后的内容 my @imgs; #一个帖子需下载的图片组 my %tids_downloaded; #记录已下载过的帖子的ID open DOWNLOADED_R, "< downloaded.txt"; while(<DOWNLOADED_R>) { chomp; $tids_downloaded{$_} = 1; } close DOWNLOADED_R; open DOWNLOADED_W, ">> downloaded.txt"; while($content =~ m{<tr align=center class=t_two>.*?<a href='read.php\?tid=(\d+)' id=''>}ig) { my $thread_id = $1; my $thread_url = "http://rr.pstang.com/read.php?tid=$thread_id"; print "\t", getts(), ", get threadURL '$thread_url' ... "; $mech->get($thread_url); print "done\n"; my $t_content = $mech->content(); my $uid; $t_content =~ s/\n/ /g; $t_content =~ s/\s{2,}/ /g; if($t_content =~ m{本主题帖为未审核新贴}) { #忽略未审核新帖 print "\tskipped\n"; next; } if(-d "$thread_id") { #忽略先前下载过的帖子 print "\tdownloaded\n"; next; } if($t_content =~ m{<font face=Gulim><b>(.*?)</b></font>}) { $uid = $1; $uid =~ s/\s//g; } my $thread_dir = "$uid - $thread_id"; mkdir("$today/$thread_dir"); #为新帖创建单独目录 #...@task_groups:按10个实例进行分组,为每个实例分配任务(要下载的图片) #...@threads:线程数组 my (@task_groups, @threads); foreach my $img ($mech->find_all_images()) { my $img_url = $img->url(); my @fs = split /\//, $img_url; my $img_name = $fs[-1]; if($img_name =~ m{\.jpg$}i) { #只需要JPG图片 next if($img_name =~ m{luo.*?\.jpg$}i); #过滤不需要的论坛图片 push @imgs, [$img_url, "$today/$thread_dir/$img_name"]; #图片URL地址及本地保存路径 } } foreach my $i (0..$#imgs) { my $gid = $i - int($i/4)*4; #$i mod 4 push @{$task_groups[$gid]}, $i; } foreach my $gid (0..$#task_groups) { my $t = threads->create('get_image', $mechs[$gid], $task_groups[$gid]); push @threads, $t; } while(my $t = shift @threads) { $t->join; } #join每个线程 @imgs = (); #处理下一个帖子前清...@imgs print DOWNLOADED_W "$thread_id\n"; $tids_downloaded{$thread_id} = 1; sleep(2); #be polite } close(DOWNLOADED_W); sub getts { return strftime "%H:%M:%S", localtime; } sub get_image { my ($mech, $task_group) = @_; foreach my $img_id (@{$task_group}) { my ($img_url, $save_path) = @{$imgs[$img_id]}; $mech->get($img_url, ':content_file' => $save_path); print "\t\t", getts(), ", got img '$img_url'\n"; } }
你为浏览论坛看图不便而郁闷吗?贡献一个邪恶的脚本,呵呵
