Python爬虫使用urllib和requests
urllib
参考资料:
python爬虫从入门到放弃(三)之 Urllib库的基本使用
Python爬虫入门三之Urllib库的基本使用
Python3的urllib.parse常用函数小结
python3 get和post请求
模块
urllib.request 请求模块
urllib.error 异常处理模块
urllib.parse url解析模块
urllib.robotparser robots.txt解析模块
1
| from urllib import request
|
request.Request
1 2
| req = request.Request("url") req.add_header("Cookie", "PHPSESSID=ga0un6plm7tea9li11bgnommh1")
|
两种添加头部信息的方法
1 2 3 4 5 6 7 8 9 10 11 12 13 14
from urllib import request, parse

# POST a form to httpbin.org, supplying the headers through the
# Request constructor (first of the two header-setting styles).
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org',
}
# FIX: renamed from `dict`, which shadowed the builtin type.
form_fields = {
    'name': 'zhaofan',
}
# urlencode produces "name=zhaofan"; the body must be bytes for urlopen.
data = bytes(parse.urlencode(form_fields), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
|
1 2 3 4 5 6 7 8 9 10 11
from urllib import request, parse

# POST a form to httpbin.org, adding the header after construction via
# Request.add_header (second of the two header-setting styles).
url = 'http://httpbin.org/post'
# FIX: renamed from `dict`, which shadowed the builtin type.
form_fields = {
    'name': 'Germey',
}
data = bytes(parse.urlencode(form_fields), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')
# Headers can also be attached one at a time on an existing Request.
req.add_header("Cookie", "PHPSESSID=ga0un6plm7tea9li11bgnommh1")
response = request.urlopen(req)
print(response.read().decode('utf-8'))
|
request.urlopen
1 2 3 4
| urlopen(url, data, timeout)
|
打开一个百度链接
1 2 3 4 5
from urllib import request

# Fetch the Baidu homepage and show both the body and the raw headers.
resp = request.urlopen("http://www.baidu.com")
body = resp.read().decode('utf-8')
print(body)
print(resp.getheaders())
|
可以用for循环修改打印的格式
1 2
| for a,b in response.getheaders(): print('%s: %s' % (a, b))
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
| Date: Mon, 16 Apr 2018 02:31:13 GMT Content-Type: text/html; charset=utf-8 Transfer-Encoding: chunked Connection: Close Vary: Accept-Encoding Set-Cookie: BAIDUID=05730C3C016AA6027A951688792E8891:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com Set-Cookie: BIDUPSID=05730C3C016AA6027A951688792E8891; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com Set-Cookie: PSTM=1523845873; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com Set-Cookie: BDSVRTM=0; path=/ Set-Cookie: BD_HOME=0; path=/ Set-Cookie: H_PS_PSSID=1456_21112_20929; path=/; domain=.baidu.com P3P: CP=" OTI DSP COR IVA OUR IND COM " Cache-Control: private Cxy_all: baidu+6d6c985d736e75076f923a5ec17088e3 Expires: Mon, 16 Apr 2018 02:30:23 GMT X-Powered-By: HPHP Server: BWS/1.1 X-UA-Compatible: IE=Edge,chrome=1 BDPAGETYPE: 1 BDQID: 0xee6d60d000001ac3 BDUSERID: 0
|
urllib.parse url解析模块
1
| from urllib import parse
|
不知道怎么讲,直接上一道CTFweb题目吧,这是实验吧的一道web题,
原题链接:http://www.shiyanbar.com/ctf/1854
解题链接:http://ctf5.shiyanbar.com/web/10/10.php
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
from urllib import request, parse
import base64


def main():
    # Solver for the CTF challenge at ctf5.shiyanbar.com/web/10/10.php:
    # read a base64 hint from a response header, then POST it back.
    url = 'http://ctf5.shiyanbar.com/web/10/10.php'
    req = request.Request(url)
    # Pin a fixed PHP session id so the GET below and the later POST
    # happen inside the same server-side session.
    req.add_header("Cookie", "PHPSESSID=ga0un6plm7tea9li11bgnommh1")
    response = request.urlopen(req)
    # Index [3] assumes the hint is always the 4th header of the
    # response; [1] takes the header's value. Fragile by design — it
    # matches this specific challenge server.
    head = response.getheaders()[3][1]
    # The decoded text starts with a fixed 25-character prefix
    # (presumably a prompt such as "the answer is: ") that is stripped.
    post = base64.b64decode(head).decode('utf-8')[25:]
    data = parse.urlencode([('key', post)])
    # Reuse the same Request object (same cookie/session); passing a
    # body to urlopen turns this second call into a POST.
    response = request.urlopen(req, data.encode('utf-8'))
    print(response.read().decode('utf-8'))


main()
|
get请求
1 2 3 4 5 6 7 8 9
from urllib import request, parse

# GET request: encode the parameters and append them as a query string.
params = {}
params['name'] = 'aaa'
query = parse.urlencode(params)
url = "http://xxxx"
# FIX: a '?' must separate the path from the query string; the original
# concatenated them directly ("http://xxxxname=aaa").
all_url = url + '?' + query
# FIX: the original did `response = urlopen(...).read()` and then called
# `response.read()` again — bytes has no .read(), so it raised
# AttributeError. Read the body exactly once.
response = request.urlopen(all_url)
print(response.read())
|
post请求
1 2 3 4 5 6 7 8 9 10
from urllib import request, parse

# POST request example.
url = 'http://xxxxxx'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name': 'aaa'}
headers = {'User-Agent': user_agent}
data = parse.urlencode(values)
# FIX: the original appended the data to the URL (url + '?' + data),
# which sends a GET, even though this section demonstrates POST.
# Passing the encoded body via `data=` makes urllib issue a real POST.
# FIX: the original built `headers` but never passed it to Request.
req = request.Request(url, data=data.encode('utf-8'), headers=headers)
response = request.urlopen(req)
print(response.read().decode('utf-8'))
|
requests
参考资料:
http://docs.python-requests.org/zh_CN/latest/user/quickstart.html
http://docs.python-requests.org/zh_CN/latest/user/advanced.html
安装requests
pip install requests
pip list # 查看
get请求
1 2 3 4
import requests

# GET with requests: query parameters ride along in the URL itself.
resp = requests.get("http://xxxxx?name=aaa")
print(resp.text)
|
post请求
1 2 3 4 5
import requests

# POST with requests: the dict passed as `data` becomes the
# form-encoded request body.
payload = {'name': 'aaa'}
resp = requests.post("http://xxxxx?name=aaa", data=payload)
print(resp.text)
|