初学爬虫(一):获取某分类下所有页地址

2017-01-10 10:04:59 来源: oschina 作者: 杨小杨 人点击



```
# Third-party dependencies: HTTP client and HTML parser used by PageCatch below.
import requests
from bs4 import BeautifulSoup
import sys
# NOTE(review): `sys.encoding` is not a real `sys` attribute; assigning it here
# is a no-op. This looks like a leftover from the Python 2
# `reload(sys); sys.setdefaultencoding("utf8")` idiom and can be removed.
sys.encoding = "utf8"
class PageCatch(object):
    """Scrape a paginated category listing and build the URL of every page.

    Page file names follow the pattern ``list<category>_<page>.html`` and are
    appended to ``host`` (site domain plus category path) to form full URLs.
    """

    def __init__(self, host, shortUrl):
        self.host = host            # domain + category prefix, e.g. "http://.../books/"
        self.shortUrl = shortUrl    # concrete page file name, e.g. "list152_35.html"
        self.url = host + shortUrl  # full URL of the starting page

    def __getPageContent(self):
        """Fetch ``self.url`` and return its decoded HTML, or "" on non-200 status."""
        req = requests.get(self.url)
        if req.status_code != 200:
            return ""
        # The target site serves simplified-Chinese pages encoded as gb2312.
        req.encoding = "gb2312"
        return req.text

    def __getMaxPageNumAndUrl(self):
        """Parse the pager block and return ``(max_page_number, page_href)``.

        The pager href looks like ``list45_2.html`` where 2 is the page number.
        Returns ``(0, "")`` when no pager is found, so callers can always
        unpack the result safely.
        """
        reqContent = self.__getPageContent()
        soup = BeautifulSoup(reqContent, "html.parser")
        for ul in soup.select(".plist"):
            # <strong> inside the pager holds the total page count.
            maxPageNum = ul.select("strong")[0].text
            alink = ul.select("a")
            # On the current page the last anchor's href is "#"; the second
            # anchor carries a concrete page href usable as a pattern.
            if alink[-1]['href'] == "#":
                return int(maxPageNum), alink[1]['href']
        # BUG FIX: the original returned the bare int 0 here, which made
        # getBookPageList crash when unpacking two values ("cannot unpack
        # non-sequence int"). Return a (count, pattern) pair instead.
        return 0, ""

    def __formatPage(self, pageNum):
        """Return the short URL for 1-based page ``pageNum + 1``, e.g. "list45_2.html"."""
        lineBeginSite = self.shortUrl.index("_") + 1  # position just after "_"
        docBeginSite = self.shortUrl.index(".")       # position of ".html"
        return self.shortUrl[:lineBeginSite] + str(pageNum + 1) + self.shortUrl[docBeginSite:]

    def getBookPageList(self):
        """Return the full URL of every page in this category as a list of str."""
        maxPageNum, _urlPattern = self.__getMaxPageNumAndUrl()
        return [self.host + self.__formatPage(i) for i in range(maxPageNum)]
if __name__ == '__main__':
p = PageCatch("http://www.jb51.net/books/","list152_35.html");
shortPageList = p.getBookPageList();
print (shortPageList);```

最新文章

123

最新摄影

微信扫一扫

第七城市微信公众平台