Web Scraping Notes

urllib

Python's built-in HTTP request library.

Four main modules:

1. request: the most basic HTTP request module

2. error: the exception handling module

3. parse: a utility module providing URL handling functions

4. robotparser: parses robots.txt

request

First crawler

#!/usr/bin/env python

import urllib.request

# Send a basic request
def load_baidu_data():
    url = 'http://www.baidu.com/'
    response = urllib.request.urlopen(url)
    #print(response.code)
    data = response.read()
    # Decode the bytes returned by read() into a string
    str_data = data.decode('utf-8')
    #print(str_data)
    # Write the data to a file
    with open('baidu.html', "w", encoding="utf-8") as f:
        f.write(str_data)

load_baidu_data()
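Besides read(), the response object returned by urlopen also exposes the status code and headers; a quick sketch of inspecting them:

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com/')
# Status code and response headers of the completed request
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))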

GET requests

#!/usr/bin/env python

import urllib.request
import urllib.parse
import string

# Send a GET request with a single parameter
def load_baidu_data():
    url = 'https://www.baidu.com/s?wd='
    # url + search term
    name = "卢本伟"
    final_url = url + name
    # The URL contains Chinese characters, so it must be percent-encoded
    encode_url = urllib.parse.quote(final_url, safe=string.printable)
    print(encode_url)

    response = urllib.request.urlopen(encode_url)
    print(response.code)

load_baidu_data()


#!/usr/bin/env python
import urllib.request
import urllib.parse

# Send a GET request with multiple parameters
def load_baidu_data():
    url = 'https://www.baidu.com/s?'
    # Build the query string from a params dict
    params = {
        "wd": "孙悟空",
        "pn": "80"
    }
    query_str = urllib.parse.urlencode(params)
    print(query_str)
    final_url = url + query_str
    print(final_url)

    response = urllib.request.urlopen(final_url)
    #print(response.code)
    str_data = response.read().decode('utf-8')
    print(str_data)
    with open('baidu-wukong-pn80.html', "w", encoding="utf-8") as f:
        f.write(str_data)

load_baidu_data()

POST requests

#!/usr/bin/env python

import urllib.request
import urllib.parse

# Define the url
# Test site: http://httpbin.org/post
url = 'http://httpbin.org/post'
# Create the form data to send
data = {
    'hello': 'world',
    'handsome': 'r1cky'
}
# The data must be encoded
data_encode = urllib.parse.urlencode(data).encode('utf-8')

# encode('utf-8'): str --> bytes, convert to bytes for transmission
# decode('utf-8'): bytes --> str, convert back to a string

# Send the POST request
response = urllib.request.urlopen(url=url, data=data_encode)
print(response.read().decode("utf-8"))

Timeout control: timeout

# Raise an error if there is no response within 0.1 seconds
response = urllib.request.urlopen(url=url, data=data_encode, timeout=0.1)

# Exception handling:
try:
    response = urllib.request.urlopen(url=url, data=data_encode, timeout=0.1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("Connection timed out")

Custom User-Agent

The core of web scraping is imitating a real user!

#!/usr/bin/env python

import urllib.request
import urllib.error
import urllib.parse

# Define the url
# Test site: http://httpbin.org/post
url = 'http://httpbin.org/post'

# Customize the request and add a User-Agent
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    "hello": "world"
}

req = urllib.request.Request(url=url, headers=header, method='POST')
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))

Mini project: random User-Agent

Switch to a different User-Agent on every request.

User-Agent lists: UA标识网 (kuzhazha.com), a collection of common UserAgent strings.

Approach:

1. Define a list of multiple User-Agent strings

2. Pick one at random for each request

#!/usr/bin/env python

import urllib.request
import urllib.error
import urllib.parse
import random

def user_agent():
    url = 'http://httpbin.org/post'
    # Define multiple User-Agent strings
    user_agent_list = [
        "Mozilla/5.0 (Linux; Android 12; ELS-AN00 Build/HUAWEIELS-AN00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/107.0.5304.141 Mobile Safari/537.36 XWEB/5075 MMWEBSDK/20230504 MMWEBID/9308 MicroMessenger/8.0.37.2380(0x2800253D) WeChat/arm64 Weixin NetType/5G Language/zh_CN ABI/arm64 MiniProgramEnv/android",
        "Mozilla/5.0 (iPhone; CPU iPhone OS............ile/15E148 MicroMessenger/8.0.34(0x18002234) NetType/4G Language/zh_CN",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; hu; rv:1.8.0.11) Gecko/20070312 Firefox/1.5.0.11",
        "Mozilla/5.0 (Macintosh; Int............ecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67",
        "Mozilla/5.0 (X1............7.6) Gecko/20050318 Firefox/1.0.2",
        "Mozilla/5.0 (Windows; U; Win............o) Chrome/100.0.4896.58 Safari/537.36 UCBrowser/15.4.8.1238",
    ]
    # Use the random module to pick one UA from user_agent_list
    random_user_agent = random.choice(user_agent_list)
    req = urllib.request.Request(url=url, method='POST')
    # Add the User-Agent header
    req.add_header("User-Agent", random_user_agent)
    response = urllib.request.urlopen(req)
    print(response.read().decode("utf-8"))


user_agent()

Custom opener

Handlers:
HTTPDefaultErrorHandler handles HTTP response errors; errors are raised as HTTPError exceptions.
HTTPRedirectHandler handles redirects.
HTTPCookieProcessor handles cookies.
ProxyHandler sets a proxy; the default proxy is empty.
HTTPPasswordMgr manages passwords; it maintains a table of usernames and passwords.
HTTPBasicAuthHandler manages authentication; use it when opening a link that requires authentication.

#!/usr/bin/env python

import urllib.request
import urllib.error
import urllib.parse

def handler_open():
    url = 'http://httpbin.org/get'
    # Build our own opener
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    print(response.read().decode("utf-8"))

handler_open()
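build_opener() accepts several handlers at once, so the handlers listed above can be combined into a single opener. A minimal sketch (the proxy address is a placeholder, not a working proxy):

import urllib.request
import http.cookiejar

# Placeholder proxy; replace "ip:port" with a real address before opening a URL
proxy_handler = urllib.request.ProxyHandler({'http': 'http://ip:port'})
cookie_handler = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())

# Both handlers are installed into one opener and used together
opener = urllib.request.build_opener(proxy_handler, cookie_handler)
# response = opener.open('http://httpbin.org/get')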

Configuring a proxy

Types of proxies:
Transparent: the target server can see our real IP.
Anonymous: the target cannot see our IP, but knows we are using a proxy.
Elite (high anonymity): the target neither sees our IP nor knows we are using a proxy.

Free proxies are mostly unreliable.

#!/usr/bin/env python

import urllib.request
import urllib.error
import urllib.parse

def handler_open():
    try:
        url = 'http://httpbin.org/post'
        # Add a proxy: ip + port
        proxy = {
            "http": "http://ip:port"
        }
        # Create the proxy handler
        proxy_handler = urllib.request.ProxyHandler(proxy)
        # Build our own opener
        opener = urllib.request.build_opener(proxy_handler)
        response = opener.open(url)
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("error: ", e)


handler_open()

Mini project: random proxies

Proxies can be purchased, e.g. 快代理 (kuaidaili.com), an enterprise HTTP proxy service.

Configure a proxy pool:


#!/usr/bin/env python

import urllib.request
import urllib.error
import urllib.parse
import random

def proxy_ip():
    url = 'http://httpbin.org/get'
    # Define multiple proxy IPs
    ip_list = [
        "http://111.224.217.xxx:19394",
        "http://114.220.35.xxx:19942",
        "http://117.87.209.xxx:23350",
        "http://121.233.172.xxx:22936",
        "http://182.38.126.xxx:16113"
    ]
    # Pick one IP for each request
    proxy = random.choice(ip_list)
    try:
        # Create the proxy handler
        proxy_handler = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
        # Build our own opener
        opener = urllib.request.build_opener(proxy_handler)
        response = opener.open(url)
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("error: ", e)


proxy_ip()

Authorization and authentication

Identity authentication:

HTTPBasicAuthHandler (handles basic HTTP authentication)
HTTPPasswordMgrWithDefaultRealm (usually used together with the auth handler)

Test site that requires authentication: https://ssr3.scrape.center/ (admin / admin)

Accessing it directly without credentials fails with an authentication error, so the request has to carry the username and password.

# Create a password manager
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# Add the target url, username and password
password_mgr.add_password(None, url, username, password)

The first argument is None, which means the default realm.
To add credentials for a different realm, replace None with that realm.

#!/usr/bin/env python

import urllib.request

def auth_login():
    url = "https://ssr3.scrape.center/"
    # Specify the username and password
    username = "admin"
    password = "admin"
    # Create the password manager
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    # Add the target url, username and password
    password_mgr.add_password(None, url, username, password)
    # Create a basic-auth handler and pass it the password manager
    handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
    # Build the opener that will perform the request
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    print(response.code)
    print(response.read().decode('utf-8'))

auth_login()

Authentication via the Authorization header

#!/usr/bin/env python

import urllib.request

def auth_login():
    url = "https://ssr3.scrape.center/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        # Add the auth field: base64 of admin:admin
        "Authorization": "Basic YWRtaW46YWRtaW4="
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))

auth_login()

cookiejar: writing and reading cookie files

Writing:

MozillaCookieJar writes cookies to a file,
saving them in the Mozilla browser cookie format.

# Write cookies to a file
#!/usr/bin/env python

import urllib.request
import http.cookiejar

filename = 'cookies.txt'
# MozillaCookieJar writes cookies to a file
# in the Mozilla browser cookie format
cookie = http.cookiejar.MozillaCookieJar(filename=filename)
# To save in libwww-perl (LWP) format instead:
#cookie = http.cookiejar.LWPCookieJar(filename=filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

response = opener.open("http://www.baidu.com")

# for item in cookie:
#     print(item)

cookie.save(ignore_discard=True, ignore_expires=True)

Running this writes cookies.txt in the Mozilla format; with LWPCookieJar the same cookies are saved in the libwww-perl (LWP) format instead.

Reading:
LWPCookieJar
handles cookie files saved in the libwww-perl (LWP) format:

# Read the cookie file and carry the cookies when visiting the page
#!/usr/bin/env python

import urllib.request
import http.cookiejar


filename = 'cookies.txt'
# Read:
cookie = http.cookiejar.LWPCookieJar()
cookie.load(filename, ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))


Mini project: logging into a website with cookies

Test site: https://www.yaozh.com/login


#!/usr/bin/env python

import urllib.request
import http.cookiejar
import urllib.parse

# 1. Log into the site with a username and password

# Login url:
url = 'https://www.yaozh.com/login'

# Login parameters:
login_data = {
    "type": "0",
    "username": "xxx",
    "pwd": "xxx",
    "pc_vcode": "86_zh-CN",
    "country": "",
    "mobile": "",
    "vcode": "",
    "pincode": "",
    "formhash": "636BCA3396",
    "backurl": "%252F%252Fwww.yaozh.com%252F",
}
# Send the login request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
}
# Encode the request parameters as bytes
login_str = urllib.parse.urlencode(login_data).encode('utf-8')

req = urllib.request.Request(url=url, headers=headers, data=login_str)
# If the login succeeds, the cookiejar saves the cookies automatically
opener.open(req)

# Visit a page that requires login, carrying the cookies
# If this request succeeds, the cookies were saved correctly
login_url = "https://www.yaozh.com/member"

req2 = urllib.request.Request(login_url, headers=headers)
response = opener.open(req2)
data = response.read()

with open('cookie.html', "wb") as f:
    f.write(data)

Exception handling

Exception handling keeps the crawler from breaking on the first failed request.

urllib's error module:

URLError: inherits from OSError

try:
    ...  # code that may raise an exception
except error.URLError as e:
    print(e.reason)  # print the reason for the error

Test:
#!/usr/bin/env python

import urllib
from urllib import request, error

try:
    # Request a path that does not exist, without credentials
    url = "https://ssr3.scrape.center/asdasd"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))

except error.URLError as e:
    print(e.reason)


HTTPError

Handles HTTP response errors, i.e. requests that come back with an error status code; it is a subclass of URLError.

#!/usr/bin/env python

import urllib
from urllib.error import *
from urllib import request

try:
    url = "https://ssr3.scrape.center/asdasd"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))

except HTTPError as e:
    print("http error: ", e)


Timeout exceptions: handled via URLError

#!/usr/bin/env python

import urllib
from urllib.error import *
import urllib.request
import socket

try:
    # url = "https://ssr3.scrape.center/asdasd"
    # headers = {
    #     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    # }
    # req = urllib.request.Request(url=url, headers=headers)
    # response = urllib.request.urlopen(req)
    # print(response.read().decode('utf-8'))
    url = "https://www.baidu.com"
    # Set the timeout to 0.01 seconds to trigger a timeout error
    response = urllib.request.urlopen(url=url, timeout=0.01)

#except HTTPError as e:
#    print("http error: ", e)
except URLError as e:
    print(e.reason)
    if isinstance(e.reason, socket.timeout):
        print("Time out!!")

URL composition and parsing

urlparse: a module in Python's standard library (urllib.parse) for parsing and manipulating URLs

  • Standard URL format:

scheme://netloc/path;params?query#fragment

scheme (protocol): http or https
netloc (network location): the host
path: the resource path
params: parameters attached to the path
query: the query parameters
fragment: anchor for in-page navigation

#!/usr/bin/env python

from urllib.parse import urlparse

url = 'http://www.baidu.com/index.html;user?id=0#comment'

result = urlparse(url=url)
print(type(result),result)

Result:
<class 'urllib.parse.ParseResult'>
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=0', fragment='comment')
  • urlunparse: the inverse of urlparse; it assembles a URL from its components
from urllib.parse import urlparse, urlunparse
data = ['http','www.baidu.com','index.html','user','id=0','comment']
url = urlunparse(data)
print(url)

Result:
http://www.baidu.com/index.html;user?id=0#comment
  • urlsplit: similar to urlparse, splitting the URL into its parts, except that it does not separate out the params component (params stay inside the path)

It returns a named tuple, so values can also be accessed by index.

from urllib.parse import urlparse, urlunparse, urlsplit, urlunsplit
url = 'http://www.baidu.com/index.html;user?id=0#comment'
parts = urlsplit(url)
print(parts)
print(parts.scheme)
print(parts[0])
print(parts.netloc)
print(parts.path)
# print(parts.params)   # SplitResult has no params attribute
print(parts.query)
print(parts.fragment)

Result:
SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html;user', query='id=0', fragment='comment')
http
http
www.baidu.com
/index.html;user
id=0
comment

# urlunsplit: reassemble the five parts
data = ('http', 'www.baidu.com', 'index.html', 'id=0', 'comment')
print(urlunsplit(data))
Result:
http://www.baidu.com/index.html?id=0#comment
  • urljoin: joins a base URL with a (possibly relative) URL to form an absolute URL

Parameters:

base: the base URL, usually an absolute URL

url: the relative URL

from urllib.parse import urlparse, urlunparse,urlsplit,urlunsplit,urljoin
base_url = 'https://www.baidu.com'
relative_url = 'path/to/xxx'

url = urljoin(base=base_url,url=relative_url)
print(url)

Result:
https://www.baidu.com/path/to/xxx

Note: base_url supplies three components: scheme, netloc and path. If any of these is missing from the new link, it is filled in from base_url; if the new link already has it, the new link's value is used. The params, query and fragment of base_url play no role.
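A few quick examples of that rule (expected output shown in comments):

from urllib.parse import urljoin

# The relative URL has no scheme or netloc, so both come from the base
print(urljoin('https://www.baidu.com/a/b', 'c'))                    # https://www.baidu.com/a/c
# The second URL is already absolute, so the base is ignored entirely
print(urljoin('https://www.baidu.com', 'http://httpbin.org/get'))   # http://httpbin.org/get
# query and fragment of the base are not carried over
print(urljoin('https://www.baidu.com/a?id=1#top', 'b'))             # https://www.baidu.com/b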

  • parse_qs

Parses a GET query string into a dictionary.

  • parse_qsl

Parses the query string into a list of (key, value) tuples.

from urllib.parse import parse_qs, parse_qsl
query = 'name=handsomer1&age=18'
print(parse_qs(query))
print(parse_qsl(query))

Result:
{'name': ['handsomer1'], 'age': ['18']}
[('name', 'handsomer1'), ('age', '18')]
  • quote

URL-encodes a string.

  • unquote

URL-decodes a string.

from urllib.parse import quote, unquote
keyword = "卢本伟"
url = 'https://www.baidu.com/s?wd='+quote(keyword)
print(url)
url_1 = unquote(url)
print(url_1)

Result:
https://www.baidu.com/s?wd=%E5%8D%A2%E6%9C%AC%E4%BC%9F
https://www.baidu.com/s?wd=卢本伟
  • robotparser module

Used to parse robots.txt.

#!/usr/bin/env python

from urllib.robotparser import RobotFileParser

# Create a RobotFileParser object for parsing robots.txt
robot_parser = RobotFileParser()
#robot_parser.set_url('https://www.baidu.com/robots.txt')
robot_parser.set_url('https://www.zhihu.com/robots.txt')
# Fetch and parse robots.txt
robot_parser.read()

# Check whether a specific url may be crawled
user_agent = "BaiduSpider"
#check_url = 'https://www.baidu.com/baidu'
check_url = 'https://www.zhihu.com'

# can_fetch tells us whether the given user agent is allowed to fetch the page
if robot_parser.can_fetch(user_agent, check_url):
    print("This url may be crawled")
else:
    print("This url may not be crawled")

The requests library

Basic requests: get, post, put, ...

#!/usr/bin/env python

import requests

# GET
# r = requests.get('http://httpbin.org/get')
# print(r.text)

# GET request with parameters
# data = {
#     'name': 'handsomewuyue',
#     'age': '18'
# }
# r = requests.get('http://httpbin.org/get', params=data)
# print(r.text)

# POST
# data = {
#     'name': 'handsomewuyue',
#     'age': '18'
# }
# r = requests.post('http://httpbin.org/post', data=data)
# print(r.text)

# r = requests.put('http://httpbin.org/put')
# r = requests.delete('http://httpbin.org/delete')
# r = requests.head('http://httpbin.org/head')
# r = requests.options('http://httpbin.org/options')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}
r = requests.get('https://www.zhihu.com', headers=headers)
print(r.text)

Advanced usage (file upload, RequestsCookieJar, sessions, etc.)

File upload

import requests

url = 'http://httpbin.org/post'
# Upload an image file
file = {'file': open('favicon.ico', 'rb')}
r = requests.post(url, files=file)
print(r.text)
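The value in the files dict may also be a tuple that sets the filename and content type explicitly; a small sketch of the same upload:

import requests

# (filename, file object, content type)
files = {'file': ('favicon.ico', open('favicon.ico', 'rb'), 'image/x-icon')}
r = requests.post('http://httpbin.org/post', files=files)
print(r.status_code)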

Getting cookies

#!/usr/bin/env python

import requests

url = 'http://www.zhihu.com'
# r = requests.get(url=url)
# print(r.cookies)
# for k, v in r.cookies.items():
#     print(k + "=" + v)

headers = {
    'Cookie': 'xxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}

r = requests.get(url=url, headers=headers)
with open('1.html', 'wb') as f:
    f.write(r.text.encode('utf-8'))

print(r.cookies)
for k, v in r.cookies.items():
    print(k + "=" + v)

Use a cookiejar to store cookies and carry them when requesting a page

#!/usr/bin/env python

import requests
import requests.cookies

Cookie = 'xxx'
# Create an empty RequestsCookieJar to hold the cookies
jar = requests.cookies.RequestsCookieJar()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}

# Split the cookie string and add each key/value pair to the cookiejar
for cookie in Cookie.split(';'):
    key, value = cookie.split('=', 1)
    jar.set(key, value)

# Send the request with the cookies and headers attached
r = requests.get('https://www.zhihu.com', cookies=jar, headers=headers)
with open('2.html', 'wb') as f:
    f.write(r.text.encode('utf-8'))

session

#!/usr/bin/env python

import requests

r1 = requests.get('http://httpbin.org/cookies/set/number/123456')
print(r1.text)
r2 = requests.get('http://httpbin.org/cookies')
print(r2.text)


The problem: each request is independent, so the cookie set in the first request no longer exists for the second request.

Using a Session:

# 1. Create a session
s = requests.Session()

# 2. Send a GET to the site to set a cookie
s.get('http://httpbin.org/cookies/set/number/123456')

# 3. Send another request and read the cookies held by the session
r = s.get('http://httpbin.org/cookies')
print(r.text)
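Besides cookies, a Session can carry default headers for every request it makes; a small sketch:

import requests

s = requests.Session()
# Headers set here are merged into every request made through this session
s.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
})
r = s.get('http://httpbin.org/headers')
print(r.text)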


SSL certificate verification

Test site: https://ssr2.scrape.center/

This site has an invalid certificate, the kind where the browser asks you to click "proceed anyway". Requesting it directly raises an SSL error; adding verify=False bypasses certificate verification:

r = requests.get('https://ssr2.scrape.center/',verify=False)
print(r.status_code)
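With verify=False, requests still prints an InsecureRequestWarning for every call; a small sketch of silencing it through urllib3:

import urllib3
import requests

# Suppress the InsecureRequestWarning emitted for unverified HTTPS requests
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

r = requests.get('https://ssr2.scrape.center/', verify=False)
print(r.status_code)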


Timeouts

The timeout parameter:

r = requests.get('https://ssr2.scrape.center/',verify=False,timeout=0.1)
print(r.status_code)
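If the limit is exceeded, requests raises requests.exceptions.Timeout, which can be caught like any other exception; a minimal sketch:

import requests

try:
    r = requests.get('https://ssr2.scrape.center/', verify=False, timeout=0.1)
    print(r.status_code)
except requests.exceptions.Timeout:
    # Raised when no response arrives within the timeout
    print("request timed out")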

Authentication (HTTPBasicAuth)

import requests
from requests.auth import HTTPBasicAuth
# Add the username and password with HTTPBasicAuth
r = requests.get('https://ssr3.scrape.center/', verify=False, auth=HTTPBasicAuth('admin', 'admin'))
print(r.text)
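A plain (username, password) tuple also works for auth and is treated as HTTP Basic auth; an equivalent shorthand:

import requests

# Passing a tuple is shorthand for HTTPBasicAuth
r = requests.get('https://ssr3.scrape.center/', auth=('admin', 'admin'))
print(r.status_code)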

Proxy settings

#!/usr/bin/env python
import requests

# HTTP proxies
proxy_list = {
    'http': 'http://proxy-address:port',
    'https': 'http://proxy-address:port'
}

requests.get('https://www.baidu.com', proxies=proxy_list)

# SOCKS proxy (requires: pip install requests[socks])
proxies = {
    'http': 'socks5://username:password@host:port',
    'https': 'socks5://username:password@host:port'
}
requests.get('https://www.baidu.com', proxies=proxies)

httpx

The httpx library can crawl sites that use HTTP/2.0, for example:

https://spa16.scrape.center/

HTTP/2 support requires an extra install:

pip install "httpx[http2]"

Then enable it in code:

client = httpx.Client(http2=True)

#!/usr/bin/env python3

import httpx

url = 'https://spa16.scrape.center/'  # this site uses HTTP/2
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}
data = {
    "name": "handsome r1"
}
client = httpx.Client(http2=True)
#response = client.get(url, headers=headers)
# POST request
response = client.post('https://www.httpbin.org/post', headers=headers, data=data)

print(response.text)
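To check whether HTTP/2 was actually negotiated, the httpx response exposes an http_version attribute; a quick sketch:

import httpx

client = httpx.Client(http2=True)
response = client.get('https://spa16.scrape.center/')
# Prints "HTTP/2" when HTTP/2 was negotiated, otherwise "HTTP/1.1"
print(response.http_version)
print(response.status_code)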
