1. GET usage
A GET request without parameters:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
url = "http://www.baidu.com"
r = requests.get(url)
print r.text
A GET request with parameters:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
url = "http://www.baidu.com"
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.get(url, params=payload)
print r.url
print r.text
Adding headers:
import requests
payload = {'key1': 'value1', 'key2': 'value2'}
headers = {'content-type': 'application/json'}
r = requests.get("http://httpbin.org/get", params=payload, headers=headers)
print r.url
In both examples requests encodes the params dict into the query string; for the Baidu example above, print r.url shows (parameter order may vary):
http://www.baidu.com/?key2=value2&key1=value1
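Besides r.text and r.url, the Response object exposes a few other fields that come up constantly; a quick sketch (httpbin.org used only as a convenient echo service):
import requests
r = requests.get("http://httpbin.org/get")
print r.status_code                # HTTP status code, e.g. 200
print r.headers['content-type']    # response headers (a case-insensitive dict)
print r.encoding                   # the encoding requests guessed for r.text
print r.json()                     # decode a JSON response body into a dict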
2. POST requests
An HTTP request consists of three parts: the request line, the request headers, and the message body, like this:
request line
request headers
message body
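For instance, a form-encoded POST looks roughly like this on the wire (an illustrative sketch, not a captured request):
POST /post HTTP/1.1            <- request line
Host: httpbin.org              <- request headers
Content-Type: application/x-www-form-urlencoded
Content-Length: 23

key1=value1&key2=value2        <- message body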
The HTTP protocol requires that data submitted via POST be placed in the message body, but it does not mandate any particular encoding. The server reads the Content-Type field in the request headers to find out how the message body is encoded, and then parses the body accordingly. The usual encodings are:
application/x-www-form-urlencoded: the most common way to submit POST data, as an HTML form.
application/json: submit the data as a JSON string.
multipart/form-data: generally used for uploading files.
Sending a POST request as a form
Requests supports sending a POST request as form data: build the request parameters into a dict and pass it to the data argument of requests.post().
import requests
url = 'http://httpbin.org/post'
d = {'key1': 'value1', 'key2': 'value2'}
r = requests.post(url, data=d)
print r.text
Output:
{
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "key1": "value1",
    "key2": "value2"
  },
  "headers": {
    ......
    "Content-Type": "application/x-www-form-urlencoded",
    ......
  },
  "json": null,
  ......
}
Sending a POST request as JSON
You can pass a JSON string to the data argument of requests.post():
import requests
import json
url = 'http://httpbin.org/post'
s = json.dumps({'key1': 'value1', 'key2': 'value2'})
r = requests.post(url, data=s)
print r.text
Output:
{
  "args": {},
  "data": "{\"key2\": \"value2\", \"key1\": \"value1\"}",
  "files": {},
  "form": {},
  "headers": {
    ......
    "Content-Type": "application/json",
    ......
  },
  "json": {
    "key1": "value1",
    "key2": "value2"
  },
  ......
}
With the method above, we can POST data in JSON format.
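Note that requests 2.4.2 and later can do the serialization for you: pass the dict to the json keyword argument and requests will both encode it and set the Content-Type: application/json header:
import requests
url = 'http://httpbin.org/post'
r = requests.post(url, json={'key1': 'value1', 'key2': 'value2'})
print r.text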
If you want to upload a file instead, pass it via the files parameter.
Create a file named report.txt whose content is Hello World!
Sending a POST request as multipart
Requests also supports sending a POST request as multipart form data: pass a file to the files argument of requests.post().
import requests
url = 'http://httpbin.org/post'
files = {'file': open('report.txt', 'rb')}
r = requests.post(url, files=files)
print r.text
Output:
{
  "args": {},
  "data": "",
  "files": {
    "file": "Hello World!"
  },
  "form": {},
  "headers": {
    ......
    "Content-Type": "multipart/form-data; boundary=467e443f4c3d403c8559e2ebd009bf4a",
    ......
  },
  "json": null,
  ......
}
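If you need to control the filename and MIME type that end up in the multipart body, requests also accepts a tuple of (filename, file object, content type) for each entry:
import requests
files = {'file': ('report.txt', open('report.txt', 'rb'), 'text/plain')}
r = requests.post('http://httpbin.org/post', files=files)
print r.text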
Basic BeautifulSoup usage
Define a test html document and build a soup object from the html text:
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
from bs4 import BeautifulSoup
# Use html.parser (Python's built-in parser) and declare the document encoding as utf-8
soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
print type(soup)
print soup
# Result of print soup:
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
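To inspect the parsed tree with one tag per line and indentation, the soup object also provides prettify():
print soup.prettify()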
1. soup.select() usage
Get the content of a given tag:
from bs4 import BeautifulSoup as bs
soup = bs(html, 'html.parser')
header = soup.select('h1')    # select() returns a list
print type(header)            # <type 'list'>
print header                  # the list, holding one html tag
print header[0]               # the tag itself
print type(header[0])         # a bs4.element.Tag object
print header[0].text          # the text inside the tag
# Output:
'''
<type 'list'>
[<h1 id="title">Hello World</h1>]
<h1 id="title">Hello World</h1>
<class 'bs4.element.Tag'>
Hello World
'''
Another example, selecting all the <a> tags:
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
from bs4 import BeautifulSoup as bs
soup = bs(html, 'html.parser', from_encoding='utf-8')
a_links = soup.select('a')
l = [x.text for x in a_links]
print l
print a_links
print type(a_links)
print a_links[0]
print type(a_links[0])
print a_links[0].text
# Output:
[u'This is link1', u'This is link2']
[<a class="link" href="#link1">This is link1</a>, <a class="link" href="#link2">This is link2</a>]
<type 'list'>
<a class="link" href="#link1">This is link1</a>
<class 'bs4.element.Tag'>
This is link1
2. Get the content of the tag with a given id (use '#')
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
from bs4 import BeautifulSoup as bs
soup = bs(html, 'html.parser', from_encoding='utf-8')
title = soup.select('#title')
print title
print type(title)
print title[0]
print type(title[0])
print title[0].text
# Output:
[<h1 id="title">Hello World</h1>]
<type 'list'>
<h1 id="title">Hello World</h1>
<class 'bs4.element.Tag'>
Hello World
3. Get the content of tags with a given class (use '.')
from bs4 import BeautifulSoup as bs
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
soup = bs(html, 'html.parser')
h = soup.select('a.link')
print h
print [x.text for x in h]
for i in [x.text for x in h]:
    print i
# Output:
[<a class="link" href="#link1">This is link1</a>, <a class="link" href="#link2">This is link2</a>]
[u'This is link1', u'This is link2']
This is link1
This is link2
I. Review
1. The notes above covered three ways of grabbing content:
we fetch tags with the select() function, and there are three ways to address them: directly by tag name ('tag'), by id attribute ('#id'), and by class attribute ('.class').
2. Each method keys off a feature of the html page:
(1) select('tag') fetches every tag with that name
(2) '#' fetches the content of the tag with the given id
(3) '.' fetches the content of tags with the given class
All three are recapped in the sketch below.
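A compact recap of all three styles, assuming the same sample html as in the earlier examples:
from bs4 import BeautifulSoup as bs
soup = bs(html, 'html.parser')
print soup.select('h1')       # (1) by tag name
print soup.select('#title')   # (2) by id
print soup.select('.link')    # (3) by class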
II. The remaining lookups
1. Get the link of an <a> tag (the value of its href attribute). 2. Get the text of all child tags under a tag.
Code example:
from bs4 import BeautifulSoup as bs
import requests
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
soup=bs(html,'html.parser')
alinks=soup.select('a')
a=[x.text for x in alinks]
print (a)
for i in a:
    print (i)
print (alinks[0]['href'])
Output:
['This is link1', 'This is link2']
This is link1
This is link2
#link1
from bs4 import BeautifulSoup as bs
import requests
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
soup = bs(html, 'html.parser')
a = soup.select('h1')
b = [x.text for x in a]
print(b)
# The same lookups with the other selector styles, kept as commented-out scratch code:
'''soup=bs(html,'html.parser')
a=soup.select('#title')
b=[x.text for x in a]
print (b)

soup=bs(html,'html.parser')
h_id=soup.select('.link')
a=[x.text for x in h_id]
print (h_id[0]['href'])
print (a)

soup=bs(html,'html.parser')
alinks=soup.select('a')
a=[x.text for x in alinks]
print (a)
for i in a:
    print (i)
print (alinks[0]['href'])'''
4. Get the text of all child tags under a tag
from bs4 import BeautifulSoup as bs
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
soup = bs(html, 'html.parser')
h = soup.select('body')[0]   # select() returns a list; take its first (only) element
print type(h)
print h
print h.text
# Output:
<class 'bs4.element.Tag'>
<body>
<h1 id="title">Hello World</h1>
<a href="#link1" class="link">This is link1</a>
<a href="#link2" class="link">This is link2</a>
</body>
Hello World
This is link1
This is link2
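Related: if you want the child strings one at a time rather than one concatenated block, Tag objects also provide the strings and stripped_strings generators (reusing h from the example above):
for s in h.stripped_strings:   # like h.text, but yields each string with surrounding whitespace stripped
    print s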
5. soup.find() and soup.find_all() usage
find() and find_all() prototypes
Both find and find_all look up tag objects in the html text by any combination of criteria. find returns a bs4.element.Tag, the first tag that satisfies the conditions, while find_all returns a bs4.element.ResultSet (in effect, a list of Tags).
find(name=None, attrs={}, recursive=True, text=None, **kwargs)
# name, attrs, and text all support regular-expression matching.
find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
# name, attrs, and text all support regular-expression matching.
Code example:
from bs4 import BeautifulSoup as bs
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
soup = bs(html, 'html.parser')
h = soup.find('a')
print type(h)
print h.text
print h['href']
print h['class']
# Output:
<class 'bs4.element.Tag'>
This is link1
#link1
[u'link']
find() calls can also be chained to walk down the tree:
from bs4 import BeautifulSoup as bs
html = '<p><a href="www.test.com" class="mylink1 mylink2">this is my link</a></p>'
soup = bs(html, 'html.parser')
h = soup.find('p').find('a')
print type(h)
print h['href']
print h['class']
print h.text
# Output:
<class 'bs4.element.Tag'>
www.test.com
[u'mylink1', u'mylink2']
this is my link
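Finally, since name, attrs, and text all accept compiled regular expressions, here is a minimal sketch (assuming the earlier sample html with the two a.link tags):
import re
from bs4 import BeautifulSoup as bs
soup = bs(html, 'html.parser')
print soup.find_all(re.compile('^a'))                          # tags whose name starts with 'a'
print soup.find_all('a', attrs={'class': re.compile('link')})  # <a> tags whose class matches a pattern
print soup.find_all(text=re.compile('link'))                   # matching strings rather than tags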