豆瓣电影信息爬取并保存到excel

2017-01-14 08:42:44 | 来源:CSDN | 作者:chuan_yu_chuan | 人点击

爬取地址: https://www.douban.com/doulist/3936288/?start=0
爬取豆瓣电影 Top250,并将电影名称、链接、导演、演员、类型、地区、年份等信息记录到 Excel 中。

"""Scrape a Douban movie doulist and save its entries to an Excel workbook.

Every list entry becomes one row of ``douban.xlsx`` with the columns:
title, link, director, actor, type, region, year.
"""

import re

import openpyxl
import requests
from bs4 import BeautifulSoup


class Movie(object):
    """One entry of the movie list together with its detail fields."""

    def __init__(self, title, href):
        self.title = title
        self.href = href
        # Detail fields are filled in after construction; default to None so
        # the getters never raise AttributeError on a partially filled entry.
        self.director = None
        self.actor = None
        self.type = None
        self.region = None
        self.year = None

    # The accessor methods are kept for backward compatibility; the
    # attributes themselves may also be read and written directly.
    def set_director(self, director):
        self.director = director

    def get_director(self):
        return self.director

    def set_actor(self, actor):
        self.actor = actor

    def get_actor(self):
        return self.actor

    def set_type(self, type):
        self.type = type

    def get_type(self):
        return self.type

    def set_region(self, region):
        self.region = region

    def get_region(self):
        return self.region

    def set_year(self, year):
        self.year = year

    def get_year(self):
        return self.year


# URL template of the list; "%s" receives the offset of the first entry.
url = "https://www.douban.com/doulist/3936288/?start=%s"
# Number of list pages to download.
pages = 10

_PAGE_SIZE = 25        # offset step between consecutive pages
_REQUEST_TIMEOUT = 10  # seconds; requests waits forever without a timeout

# Raw strings so that "\s" reaches the regex engine as the whitespace class.
_TITLE_RE = re.compile(r'target="_blank">\s+(.*)\s+</a>')
# Non-greedy so the capture stops at the first closing quote of the attribute.
_HREF_RE = re.compile(r'href="(.*?)"\s')

# Line numbers (after splitting the abstract text on "\n") that the script
# reads for each field.
# NOTE(review): assumes the page puts one separator line between fields —
# verify against the live page markup.
_DIRECTOR_LINE = 2
_ACTOR_LINE = 4
_TYPE_LINE = 6
_REGION_LINE = 8
_YEAR_LINE = 10


def build_page_urls(page_count=pages):
    """Return the URL of every list page, in order."""
    return [url % (page * _PAGE_SIZE) for page in range(page_count)]


def fetch_entries(page_urls):
    """Download each page and return all ``.doulist-subject`` elements.

    Raises:
        requests.RequestException: on a network failure, a timeout or an
            HTTP error status, so an error page is never parsed as a list.
    """
    entries = []
    for page_url in page_urls:
        res = requests.get(page_url, timeout=_REQUEST_TIMEOUT)
        res.raise_for_status()
        soup = BeautifulSoup(str(res.content, "utf-8"), "html.parser")
        entries.extend(soup.select(".doulist-subject"))
    return entries


def _line_at(lines, index):
    """Return ``lines[index]`` without surrounding whitespace, or ""."""
    return lines[index].strip() if index < len(lines) else ""


def parse_movie(entry):
    """Build a :class:`Movie` from one ``.doulist-subject`` element.

    Returns ``None`` when the element has no recognisable title link or no
    ``.abstract`` block, so one malformed entry does not abort the run.
    """
    link_markup = str(entry.select(".title a"))
    title_match = _TITLE_RE.search(link_markup)
    href_match = _HREF_RE.search(link_markup)
    abstracts = entry.select(".abstract")
    if title_match is None or href_match is None or not abstracts:
        return None

    movie = Movie(title_match.group(1).strip(), href_match.group(1))
    lines = abstracts[0].get_text().split("\n")
    movie.director = _line_at(lines, _DIRECTOR_LINE)
    movie.actor = _line_at(lines, _ACTOR_LINE)
    movie.type = _line_at(lines, _TYPE_LINE)
    movie.region = _line_at(lines, _REGION_LINE)
    movie.year = _line_at(lines, _YEAR_LINE)
    return movie


def save_movies(movies, path="douban.xlsx"):
    """Write one row per movie to a new workbook saved at ``path``."""
    wb = openpyxl.Workbook()
    sheet = wb.active  # property; replaces the old get_active_sheet() call
    for movie in movies:
        # append() writes to the next free row, starting at row 1.
        sheet.append([
            movie.title,
            movie.href,
            movie.director,
            movie.actor,
            movie.type,
            movie.region,
            movie.year,
        ])
    wb.save(path)


def main():
    """Scrape every page of the list and export the result."""
    entries = fetch_entries(build_page_urls())
    movies = [m for m in map(parse_movie, entries) if m is not None]
    save_movies(movies)
    print("ok")


if __name__ == "__main__":
    main()

最新文章

123

最新摄影

微信扫一扫

第七城市微信公众平台