Web Crawler Basics, Part 2

Using Requests

Requests official documentation

Third-party reference documentation

Requesting a page

import requests

# requests.get
url = "http://www.baidu.com/"
try:
    response = requests.get(url)
    # Raises requests.HTTPError for 4xx/5xx status codes
    response.raise_for_status()
except requests.RequestException as e:
    print("request failed:", e)
else:
    # Save the page to a local file
    with open('baidu.html', 'wb') as f:
        f.write(response.content)
    # The final URL of the request
    print(response.url)
    # Response body as raw bytes
    print(response.content)
    # Response body decoded to a Unicode string
    print(response.text)
    # Encoding guessed from the HTTP headers
    print(response.encoding)
    # Encoding inferred from the body itself (fallback)
    print(response.apparent_encoding)
    # Status code
    print(response.status_code)
    # Headers that were sent with the request
    print(response.request.headers)
    # Cookies that were sent with the request
    print(response.request._cookies)
    # Response headers
    print(response.headers)
    # Response cookies
    print(response.cookies)
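The example above handles HTTP errors, but a hung connection is a separate failure mode: without a timeout, requests will wait indefinitely. A minimal sketch of a more defensive pattern (the 5-second value is an arbitrary assumption):

import requests

url = "http://www.baidu.com/"
try:
    # Fail fast if the server does not answer within 5 seconds
    response = requests.get(url, timeout=5)
    response.raise_for_status()
except requests.Timeout:
    print("request timed out")
except requests.RequestException as e:
    print("request failed:", e)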

Adding request headers

import requests
url = "http://www.baidu.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}

response = requests.get(url, headers=headers)
print(response.status_code)
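Without this header, requests announces itself as python-requests/<version>, which some sites reject. A quick sketch to compare what is actually sent (the browser string is abbreviated here):

import requests

url = "http://www.baidu.com/"
# Default identity: something like python-requests/2.x
print(requests.get(url).request.headers["User-Agent"])

# With a browser-style header, the site sees a normal browser visit
headers = {"User-Agent": "Mozilla/5.0"}
print(requests.get(url, headers=headers).request.headers["User-Agent"])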

Passing parameters with GET

import requests

url = 'https://www.baidu.com/s?'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}

params = {
    'wd': "风景",
    'ie': 'utf-8',
    'mod': '1',
    'isbd': '1'
}

response = requests.get(url, headers=headers, params=params)
print(response.url)
with open('03-baidu.html', 'wb') as f:
    f.write(response.content)
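requests builds and percent-encodes the query string from the params dict, which is why the Chinese value of wd shows up URL-encoded in response.url. For comparison, a sketch of producing the same query string by hand with the standard library:

from urllib.parse import urlencode

params = {'wd': "风景", 'ie': 'utf-8', 'mod': '1', 'isbd': '1'}
# Equivalent to what requests appends after the '?'
print('https://www.baidu.com/s?' + urlencode(params))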

POST requests

import requests

# requests.post

log_url = 'https://www.yaozh.com/login'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
    'Host': 'www.yaozh.com',
    'Origin': 'https://www.yaozh.com',
    'Referer': 'https://www.yaozh.com/login/proxy?time=1602222533216'
}

# Form fields captured from the browser's login request
data = {
    'username': '*****',
    'pwd': '*****',
    'formhash': 'C14C9D4CF9',
    'backurl': 'https%3A%2F%2Fwww.yaozh.com%2F'
}

response = requests.post(log_url, headers=headers, data=data)
print(response.text)
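Note that data= sends the fields as an application/x-www-form-urlencoded body, matching what a browser form posts. For APIs that expect a JSON body instead, requests can serialize it via json=. A minimal sketch against httpbin.org (a public echo service, used here purely for illustration):

import requests

# Form-encoded body: Content-Type: application/x-www-form-urlencoded
r1 = requests.post('https://httpbin.org/post', data={'key': 'value'})
# JSON body: Content-Type: application/json
r2 = requests.post('https://httpbin.org/post', json={'key': 'value'})
print(r1.json()['form'], r2.json()['json'])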

Setting a proxy

import requests
# Route the request through an HTTP proxy
url = "http://www.baidu.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}

proxy = {
    'http': '1.198.73.11:9999'
}
response = requests.get(url, headers=headers, proxies=proxy)
print(response.status_code)
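The proxies dict maps URL schemes to proxy addresses, so HTTPS URLs need their own 'https' entry, and a dead proxy raises requests.exceptions.ProxyError. A sketch (the proxy address is a placeholder assumption):

import requests

proxies = {
    'http': 'http://1.198.73.11:9999',   # placeholder address
    'https': 'http://1.198.73.11:9999',  # HTTPS traffic needs its own entry
}
try:
    response = requests.get("http://www.baidu.com/", proxies=proxies, timeout=5)
    print(response.status_code)
except requests.exceptions.ProxyError:
    print("proxy is unreachable")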

Ignoring SSL verification

import requests

url = 'https://www.12306.cn/mormhweb/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}

# HTTPS normally relies on a certificate signed by a trusted third-party CA.
# 12306 serves HTTPS, but (at the time of writing) with a self-signed
# certificate rather than a CA-issued one, so verification fails.
# Workaround: tell requests to skip certificate verification.
response = requests.get(url=url, headers=headers, verify=False)
data = response.content

with open('03-ssl.html', 'wb') as f:
    f.write(data)
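With verify=False, urllib3 emits an InsecureRequestWarning on every request. If the risk is understood, the warning can be silenced; a sketch:

import requests
import urllib3

# Acknowledge that certificate verification is deliberately disabled
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

response = requests.get('https://www.12306.cn/mormhweb/', verify=False)
print(response.status_code)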

HTTP authentication

import requests
# auth = (username, password)
auth = ('admin', 'admin')
url = 'https://ssr3.scrape.center/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response = requests.get(url, headers=headers, auth=auth)
print(response.text)
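The (username, password) tuple is shorthand for HTTP Basic auth; the explicit form uses requests.auth.HTTPBasicAuth, and requests also ships HTTPDigestAuth for digest-protected endpoints. A sketch of the explicit form:

import requests
from requests.auth import HTTPBasicAuth

url = 'https://ssr3.scrape.center/'
# Equivalent to auth=('admin', 'admin')
response = requests.get(url, auth=HTTPBasicAuth('admin', 'admin'))
print(response.status_code)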

Handling cookies

import requests

# requests.post via a session

log_url = 'https://www.yaozh.com/login'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
    'Host': 'www.yaozh.com',
    'Origin': 'https://www.yaozh.com',
    'Referer': 'https://www.yaozh.com/login/proxy?time=1602222533216'
}

data = {
    'username': '*****',
    'pwd': '******',
    'formhash': 'C14C9D4CF9',
    'backurl': 'https%3A%2F%2Fwww.yaozh.com%2F'
}
# A Session keeps cookies automatically (a CookieJar under the hood)
session = requests.Session()
# 1. Log in programmatically
response = session.post(log_url, headers=headers, data=data)
print(response.text)

center_url = 'https://www.yaozh.com/member/'
# 2. After logging in, request the target page with the valid cookies
response = session.get(center_url, headers=headers)
with open('01-yaozhi.html', 'wb') as f:
    f.write(response.content)
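After the login POST, the server's Set-Cookie headers land in session.cookies and are replayed automatically on every later request through that session. Cookies can also be inspected or supplied by hand; a sketch (the 'sessionid' cookie name is hypothetical):

import requests

session = requests.Session()
session.get('http://www.baidu.com/')
# Cookies the server set, as a plain dict
print(session.cookies.get_dict())

# A one-off request can also carry cookies explicitly
response = requests.get('http://www.baidu.com/', cookies={'sessionid': 'abc123'})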