Python 3 Basic Crawler (1)

2017-01-09 14:11:15 | Source: oschina | Author: sky_lion

1. A basic crawler implementation in Python 3.x


#!/usr/bin/env python3
# -*- encoding=utf-8 -*-
"""
Example: fetch the pages of a site and scrape all the teacher information, as an exercise.
"""
import re
import socket
from urllib import request
from urllib import error

# set the default socket timeout so a stalled request does not block forever
timeout = 10
socket.setdefaulttimeout(timeout)


def http_curl(url, page):
    """Fetch one page and return its body decoded as UTF-8, or None on failure."""
    try:
        res = request.urlopen(url)
    except error.HTTPError as e:
        print("the server can't fulfill the request")
        print('Error code:', e.code, 'Error reason:', e.reason)
    except error.URLError as e:
        print("can't reach the server")
        print('Error reason:', e.reason)
    else:
        return res.read().decode('utf-8')
    return None


def re_pattern(pattern, content):
    """Return every match of pattern found in content."""
    regex_pattern = re.compile(pattern)
    result = re.findall(regex_pattern, content)
    return result


if __name__ == "__main__":
    url_format = "xxx"
    # two capture groups: (teacher name, profile text); the page source has its
    # '<', '>' and newlines stripped before matching, hence the bare tag names
    pattern = r"class=\"t3out\".*?title=\"([\w|_| ]+?)\".*?class=\"color66\"span.*?\/span(.*?)\/p.*?\/li"
    fd = open('teacher_info.txt', 'w', encoding='utf-8')
    for i in range(1, 27):                      # pages 1 to 26
        url = url_format.format(i)
        curl_str = http_curl(url, i)
        if curl_str is None:                    # skip pages that failed to download
            continue
        # flatten the HTML so the regex can run over plain text
        teacher_info = re_pattern(pattern, curl_str.replace("<", "").replace(">", "").replace("\n", ""))
        for info in teacher_info:
            # 姓名 = name, 简介 = profile
            file_format = "姓名:{0},简介:{1}\n".format(info[0], info[1])
            fd.write(file_format)
        fd.write("\n")
    fd.close()
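
To see what the two capture groups actually pull out, here is a quick standalone check of the same regex against an invented snippet that imitates the page after the '<', '>' and newline stripping (the teacher data and markup below are made up purely for illustration):

import re

sample = ('li class="t3out" a title="Zhang_San" href="#" /a '
          'p class="color66"span简介/span A lecturer in computer science./p /li')
pattern = r"class=\"t3out\".*?title=\"([\w|_| ]+?)\".*?class=\"color66\"span.*?\/span(.*?)\/p.*?\/li"
for name, profile in re.findall(pattern, sample):
    print(name.strip(), '--', profile.strip())
# prints: Zhang_San -- A lecturer in computer science.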


Summary:

1. This approach is only meant for getting started; it does not touch cookie handling, image downloading, proxies, threading and so on (a sketch of these extensions follows after this list).

2. Since this is basic practice, higher-level libraries such as BeautifulSoup and Scrapy are deliberately not used (a BeautifulSoup version of the extraction step is also sketched below).
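
For point 1, here is a minimal sketch of how cookie handling, a proxy and image downloading could be added on top of the same urllib machinery; the proxy address and the save_image helper are illustrative assumptions, not part of the original script:

import urllib.request
from http.cookiejar import CookieJar

# install a global opener that keeps cookies and routes requests through a proxy
# (the proxy address below is a placeholder)
cookie_jar = CookieJar()
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookie_jar),
    urllib.request.ProxyHandler({'http': 'http://127.0.0.1:8080'}),
)
urllib.request.install_opener(opener)   # request.urlopen(), and hence http_curl(), now uses it

def save_image(img_url, path):
    """Download one image: the response body is bytes, so write it in binary mode."""
    with urllib.request.urlopen(img_url, timeout=10) as resp, open(path, 'wb') as f:
        f.write(resp.read())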
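
For point 2, the extraction step could be rewritten with BeautifulSoup instead of a regex. This is only a sketch, because the real page layout is unknown and the selectors are guessed from the class names (t3out, color66) used in the pattern above:

from bs4 import BeautifulSoup   # pip install beautifulsoup4

def parse_teachers(html):
    """Return (name, profile) pairs without flattening the HTML or writing a regex."""
    soup = BeautifulSoup(html, 'html.parser')
    pairs = []
    for item in soup.select('.t3out'):               # assumed element carrying the teacher name in its title attribute
        name = item.get('title', '').strip()
        intro = item.find_next(class_='color66')     # assumed element holding the profile text
        pairs.append((name, intro.get_text(strip=True) if intro else ''))
    return pairs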
