用Perl脚本来抓取 天涯论坛 的帖子,呵呵 看帖子方便多了[网络爬虫]

2016-08-03 13:47:22来源:oschina作者:08048000人点击

功能分析:


给定一个 天涯论坛 的帖子URL链接地址,获取从当前地址开始的所有整个 主题帖子的内容。


可以选定包含 非楼主回复内容


1, 先看看天涯论坛帖子中 我们所关心的文档结构:



xxx
标题



点击数 回复数








作者发表日期





作者发表日期




2,用HTML::TreeBuilder、LWP::Simple、HTML::Element、Encode 库来实现。参考:CPAN 上 HTML::Element 的文档(如 parent() / $h->parent($new_parent) 等方法)。


3,一个文件crawl.pl,使用实例:


perl crawl.pl http://www.tianya.cn/publicforum/content/funinfo/1/2854941.shtml > ylbg.txt


perl crawl.pl http://www.tianya.cn/publicforum/content/no05/1/204253.shtml > yslt.txt


4, crawl.pl文件源代码,呵呵,写的比较错,还请见谅啊 下载


【注意 引用处的 $$ 被CSDN替换为$了,CSDN编辑器太烂,都不支持Perl】

use strict;
use warnings;
use HTML::TreeBuilder;
use LWP::Simple;
use Encode;
no utf8;    # treat the source text as literal bytes in the current lexical scope

# If more than one argument is given, suppress replies by non-authors.
# $other == 1 -> print everyone's posts; 0 -> topic author only.
my $other = 1;
$other = 0 if (@ARGV > 1);

my ($url) = @ARGV;
if (@ARGV < 1)
{
    print "Input an URL\n";
    exit;
}

# Allow a bare host/path (e.g. www.tianya.cn/...) by prefixing the scheme.
if ($url !~ /^http:/)
{
    $url = "http://$url";
}

# File-scoped state shared by all subs below.
my ($pagewrap,
    $title,
    $auther,          # author name (original spelling kept)
    $clicks,          # click count of the topic
    $totalPage,       # "total pages" indicator text
    @idArticleslist,  # IDs of every page of this topic
    $root             # HTML document root element
);

&PrintPage($url);
# Fetch $url and (re)build the global HTML::TreeBuilder tree into $root.
# Dies when the page cannot be retrieved.
# (Empty () prototype removed: the sub takes one argument, and a ()
# prototype would make any non-& call a compile-time error.)
sub InitRootfromURl
{
    my ($url) = @_;
    my $content = get($url);    # LWP::Simple::get
    die "Couldn't get the URL $url!" unless defined $content;
    $root = HTML::TreeBuilder->new_from_content($content);
    $root->eof();    # done parsing for this tree
}
# Print the whole topic starting from the page named by $link.
# Example input:
#   http://www.tianya.cn/publicforum/content/feeling/1/1323274.shtml
# urlPattern is: http://www.tianya.cn/publicforum/content/feeling/1
# postfix is:    shtml
# NOTE(review): the scraped source had "my ($link)=@_;" fused into a
# comment; it is restored here — the sub cannot work without it.
sub PrintPage
{
    my ($link) = @_;
    my ($urlPattern,    # URL with the trailing ID component stripped
        $posfitx,       # the link postfix (file extension)
        $CurrID         # ID of the page named by the input URL
    );
    my @tmp = split(/\//, $link);
    $urlPattern = join('/', @tmp[0 .. $#tmp - 1]);
    ($CurrID, $posfitx) = split(/\./, $tmp[-1]);
    @tmp = ();

    &InitRootfromURl($link);
    &InitFromfirstLink(\$root);
    return unless ($idArticleslist[0]);

    # If the given URL is not the topic's first page, fetch the first page
    # as well — it carries the header info (title, author, clicks, pages).
    if ($CurrID ne $idArticleslist[0])
    {
        $link = "$urlPattern/$idArticleslist[0].$posfitx";
        &InitRootfromURl($link);
        &InitFromfirstLink(\$root);
        $root->delete();    # the first page's tree is no longer needed
    }

    {    # header info gathered from the first page
        print "title:";    &toGBK(\$title)  if ($title);
        print "\nauther:"; &toGBK(\$auther) if ($auther);
        print "\nclicks:"; &toGBK(\$clicks) if ($clicks);
        if ($totalPage) { print "\n"; &toGBK(\$totalPage); print "\n"; }
    }

    # Print the content of every page, starting at the requested one.
    print "正文:\n";

    my ($i, $start) = (0, 0);
    foreach my $id (@idArticleslist)
    {
        $i++;
        $start = 1 if (!$start && ($id eq $CurrID));
        if ($start)
        {
            print "\nPage $i\n";
            $link = "$urlPattern/$id.$posfitx";
            &InitRootfromURl($link);
            &ParseAllPost(\$root);
            $root->delete();    # destroy the document tree
        }
    }
}
# Print the referenced string to STDOUT encoded as GBK.
# Takes a SCALAR ref; strips the mojibake sequence "牋牋" in place first.
# The original looped over split(//, ...) and encoded one character at a
# time; encoding the whole string in a single call produces identical
# output and avoids the per-character overhead.
sub toGBK
{
    my ($content) = @_;
    $$content =~ s/牋牋//g;    # remove garbled full-width space pairs
    # If your OS is English, you may simply `print $$content;` instead.
    print encode("gbk", $$content);
}
# Print one text node: trim trailing whitespace, drop stringified element
# refs, then emit as GBK. Takes a SCALAR ref; modifies the scalar in place.
# NOTE(review): the scraped source read "TML::Element"; restored here as
# "HTML::Element" (the prefix of a stringified HTML::Element ref, e.g.
# "HTML::Element=HASH(0x...)") — confirm against the upstream original.
sub printContent
{
    my ($content) = @_;
    $$content =~ s/\s+$//g;
    $$content =~ s/HTML::Element=.+$//;
    &toGBK($content);
}
# Print the replier's name boxed between separator lines:
#   ----------
#   |name|
#   ----------
# Takes a SCALAR ref; returns early for blank names. The separator width
# is based on the UTF-8 byte length so multi-byte names line up.
sub printReplier
{
    my ($writer) = @_;
    return if ($$writer =~ /^\s*$/);
    $$writer =~ s/\s+//;    # strip the first whitespace run
    my $dil = "-" x length(Encode::encode_utf8($$writer));
    print "\n$dil\n|";
    &toGBK($writer);
    print "|\n$dil\n";
}
#=====
#print content info
# Recursively print the textual content of one post node.
# Takes a ref to a node; content_list() yields either HTML::Element
# children or plain text strings.
sub ParsePost
{
    my ($node) = @_;
    foreach my $n ($$node->content_list())
    {
        if ($n =~ /^HTML::Element=/)
        {
            next if (!(my $tag = $n->tag()));
            if ($tag eq 'br') { print "\n"; next; }
            # skip report-post links, images and decorative inline tags
            next if (($tag eq 'div') && ($n->attr('class')) && ($n->attr('class') eq 'post-jb'));
            next if ($tag eq 'img');
            next if ($tag eq 'span');
            next if ($tag eq 'u');
            &ParsePost(\$n);
        }
        {
            # Deliberate fall-through: element nodes also reach here, but
            # printContent strips the stringified "HTML::Element=..." form,
            # so only plain text nodes actually produce output.
            &printContent(\$n);
        }
    }
}
#Parse allpost
#=====
#print content info
# Walk the 'allpost' container (div#pContentDiv) of one page: print each
# replier header and post body. When the global $other is 0, only posts
# written by the topic author ($auther) are shown.
sub ParseAllPost
{
    my ($node) = @_;    # ref to the page's HTML::Element root
    my ($tag, $attr, @poster, $writer, $isShow);
    return unless ($$node);
    my @a = $$node->look_down(_tag => 'div', class => 'allpost', id => 'pContentDiv');
    return unless ($a[0]);
    $isShow = 1;
    foreach my $n ($a[0]->content_list())
    {
        next if ($n !~ /^HTML::Element=/);
        next if (!($tag = $n->tag()));
        # a 'post' div is one reply body
        if ($isShow && ($tag eq 'div'))
        {
            next if (!($attr = $n->attr('class')));
            next if ($attr ne 'post');
            &ParsePost(\$n);
        }
        # a table precedes each post and names the replier; it decides
        # whether the following body is shown
        if ($tag eq 'table')
        {
            @poster = $n->look_down(_tag => 'a', target => '_blank');
            $writer = $poster[0]->as_text();
            if (!$other)
            {
                # show only the topic author's own posts
                $isShow = ($writer eq $auther) ? 1 : 0;
            }
            &printReplier(\$writer) if ($isShow);
        }
    }
}
#Initialize the auther info and link info
#Parse pagewrap and, find title, auther ,and all links
# Parse the 'pagewrap' div of the first page and fill the file-scoped
# globals: $title, $auther, $clicks, $totalPage and @idArticleslist.
# Takes a ref to the document root element.
sub InitFromfirstLink
{
    my ($node) = @_;
    my ($tag, $attr, @tmp, @a, $b);
    return unless ($$node);
    # locate the 'pagewrap' container
    @tmp = $$node->look_down(_tag => 'div', class => 'pagewrap');
    return unless ($tmp[0]);
    foreach my $n ($tmp[0]->content_list())
    {
        next if ($n !~ /^HTML::Element=/);
        next if (!($tag = $n->tag()));
        if ($tag eq 'h1')
        {
            # the second <span> inside <h1> holds the topic title
            @tmp = $n->look_down(_tag => 'span');
            next unless ($tmp[1]);
            $title = $tmp[1]->as_text();
            @tmp = ();
            next;
        }
        if ($tag eq 'table')
        {
            # the first link of the header table is the author
            @tmp = $n->look_down(_tag => 'a');
            next unless ($tmp[0]);
            $auther = $tmp[0]->as_text();
            @tmp = ();
            next;
        }
        next if (!($attr = $n->attr('class')));
        if (($tag eq 'div') && ($attr eq 'function'))
        {
            @tmp = $n->content_list();
            {    # total-page indicator lives in the first <span>
                @a = $n->look_down(_tag => 'span');
                $totalPage = $a[0]->as_text() if (@a > 0);
                @a = ();
            }
            if ($tmp[0]->attr('class') eq 'info')
            {
                # "info" text is "点击数 回复数"; the second field is clicks
                @a = split(/ /, $tmp[0]->as_text());
                $clicks = $a[1];
                @a = ();
            }
            {    # hidden input 'idArticleslist': comma-separated page IDs
                @a = $n->look_down(_tag => 'input', name => 'idArticleslist');
                if (@a > 0)
                {
                    $b = $a[0]->attr('value');
                    @idArticleslist = split(/,/, $b);
                }
                @a = ();
            }
            @tmp = ();
            next;
        }
        if (($tag eq 'div') && ($attr eq 'allpost'))
        {
            last;    # everything after 'allpost' is post content, handled elsewhere
        }
    }
}

=pod
天涯帖子结构


xxx
标题



点击数 回复数








作者发表日期





作者发表日期





=cut

最新文章

123

最新摄影

微信扫一扫

第七城市微信公众平台