Web Scraping Notes: urllib, Python's built-in HTTP request library
Its 4 main modules:
1. request: the most basic HTTP request module
2. error: the exception handling module
3. parse: a utility module providing methods for handling URLs
4. robotparser: parses robots.txt files
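As a quick orientation (a sketch, not part of the original notes), the four submodules are imported individually:

```python
# The four urllib submodules are imported separately
import urllib.request      # sending HTTP requests (urlopen, Request, openers)
import urllib.error        # URLError / HTTPError exceptions
import urllib.parse        # urlparse, urlencode, quote, ...
import urllib.robotparser  # RobotFileParser for robots.txt
```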
request: a first crawler

```python
import urllib.request

def load_baidu_data():
    url = 'http://www.baidu.com/'
    response = urllib.request.urlopen(url)
    data = response.read()
    str_data = data.decode('utf-8')
    # save the page to a local file
    with open('baidu.html', "w", encoding="utf-8") as f:
        f.write(str_data)

load_baidu_data()
```
GET requests

Percent-encoding a keyword with quote:

```python
import urllib.request
import urllib.parse
import string

def load_baidu_data():
    url = 'https://www.baidu.com/s?wd='
    name = "卢本伟"
    final_url = url + name
    # percent-encode the non-ASCII part of the URL
    encode_url = urllib.parse.quote(final_url, safe=string.printable)
    print(encode_url)
    response = urllib.request.urlopen(encode_url)
    print(response.code)

load_baidu_data()
```

Building a query string with urlencode:

```python
import urllib.request
import urllib.parse

def load_baidu_data():
    url = 'https://www.baidu.com/s?'
    params = {
        "wd": "孙悟空",
        "pn": "80"
    }
    # build the query string from a dict
    query_str = urllib.parse.urlencode(params)
    print(query_str)
    final_url = url + query_str
    print(final_url)
    response = urllib.request.urlopen(final_url)
    str_data = response.read().decode('utf-8')
    print(str_data)
    with open('baidu-wukong-pn80.html', "w", encoding="utf-8") as f:
        f.write(str_data)

load_baidu_data()
```
POST requests

```python
import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'
data = {
    'hello': 'world',
    'handsome': 'r1cky'
}
# POST data must be bytes: urlencode the dict, then encode
data_encode = urllib.parse.urlencode(data).encode('utf-8')
response = urllib.request.urlopen(url=url, data=data_encode)
print(response.read().decode("utf-8"))
```
Timeout control: the timeout parameter

```python
# timeout (in seconds) makes urlopen raise if the server is too slow
response = urllib.request.urlopen(url=url, data=data_encode, timeout=0.1)

try:
    response = urllib.request.urlopen(url=url, data=data_encode, timeout=0.1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("connection timed out")
```
Custom User-Agent

The core of a crawler is imitating a real user!!!
```python
import urllib.request
import urllib.error
import urllib.parse

url = 'http://httpbin.org/post'
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    "hello": "world"
}
# build a Request object so custom headers can be attached
req = urllib.request.Request(url=url, headers=header, method='POST')
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))
```
Mini-project: random UA headers

Switch to a different User-Agent on every request.

UA header data source: UserAgent大全_常用的UserAgent库下载_UserAgent在线工具_流行的UserAgent手机库UA_UA标识网 (kuzhazha.com)

Approach:
1. Define a list of several UA headers
2. Pick a different UA header for each request
```python
import urllib.request
import urllib.error
import urllib.parse
import random

def user_agent():
    url = 'http://httpbin.org/post'
    user_agent_list = [
        "Mozilla/5.0 (Linux; Android 12; ELS-AN00 Build/HUAWEIELS-AN00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/107.0.5304.141 Mobile Safari/537.36 XWEB/5075 MMWEBSDK/20230504 MMWEBID/9308 MicroMessenger/8.0.37.2380(0x2800253D) WeChat/arm64 Weixin NetType/5G Language/zh_CN ABI/arm64 MiniProgramEnv/android",
        "Mozilla/5.0 (iPhone; CPU iPhone OS............ile/15E148 MicroMessenger/8.0.34(0x18002234) NetType/4G Language/zh_CN",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; hu; rv:1.8.0.11) Gecko/20070312 Firefox/1.5.0.11",
        "Mozilla/5.0 (Macintosh; Int............ecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67",
        "Mozilla/5.0 (X1............7.6) Gecko/20050318 Firefox/1.0.2",
        "Mozilla/5.0 (Windows; U; Win............o) Chrome/100.0.4896.58 Safari/537.36 UCBrowser/15.4.8.1238",
    ]
    # pick a random UA for this request
    random_user_agent = random.choice(user_agent_list)
    req = urllib.request.Request(url=url, method='POST')
    req.add_header("User-Agent", random_user_agent)
    response = urllib.request.urlopen(req)
    print(response.read().decode("utf-8"))

user_agent()
```
Custom openers

Handlers:
- HTTPDefaultErrorHandler: handles HTTP response errors; errors are raised as HTTPError exceptions.
- HTTPRedirectHandler: handles redirects.
- HTTPCookieProcessor: handles cookies.
- ProxyHandler: sets a proxy; by default no proxy is used.
- HTTPPasswordMgr: manages passwords; it keeps a table of usernames and passwords.
- HTTPBasicAuthHandler: manages authentication; use it when opening a link that requires authentication.
```python
import urllib.request
import urllib.error
import urllib.parse

def handler_open():
    url = 'http://httpbin.org/get'
    # build an opener around an HTTPHandler instead of calling urlopen directly
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    print(response.read().decode("utf-8"))

handler_open()
```
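build_opener also accepts several handlers at once, so different capabilities can be combined into a single opener. A minimal sketch, with the proxy address left as a placeholder:

```python
import urllib.request
import http.cookiejar

# Combine cookie handling and a proxy in one opener (proxy address is a placeholder)
cookie_jar = http.cookiejar.CookieJar()
handlers = [
    urllib.request.HTTPCookieProcessor(cookie_jar),
    urllib.request.ProxyHandler({"http": "http://ip:port"}),
]
opener = urllib.request.build_opener(*handlers)
# opener.open('http://httpbin.org/get')  # requests would go through the proxy, with cookies tracked
```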
Configuring a proxy

Proxy types:
- Transparent: the target can see our real IP
- Anonymous: the target cannot see our IP, but knows we are using a proxy
- High-anonymity: the target neither sees our IP nor knows a proxy is in use

Free proxies are mostly unreliable.
```python
import urllib.request
import urllib.error
import urllib.parse

def handler_open():
    try:
        url = 'http://httpbin.org/post'
        proxy = {
            "http": "http://ip:port"
        }
        # route requests through the proxy
        proxy_handler = urllib.request.ProxyHandler(proxy)
        opener = urllib.request.build_opener(proxy_handler)
        response = opener.open(url)
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("error: ", e)

handler_open()
```
Mini-project: random proxies

Proxies can be purchased at 快代理 - 企业级HTTP代理IP云服务 (kuaidaili.com).

Configure a proxy pool:
```python
import urllib.request
import urllib.error
import urllib.parse
import random

def proxy_ip():
    url = 'http://httpbin.org/get'
    ip_list = [
        "http://111.224.217.xxx:19394",
        "http://114.220.35.xxx:19942",
        "http://117.87.209.xxx:23350",
        "http://121.233.172.xxx:22936",
        "http://182.38.126.xxx:16113"
    ]
    # pick a random proxy from the pool for this request
    proxy = random.choice(ip_list)
    try:
        proxy_handler = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
        opener = urllib.request.build_opener(proxy_handler)
        response = opener.open(url)
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("error: ", e)

proxy_ip()
```
Authorization

Authentication handlers:
- HTTPBasicAuthHandler: handles basic HTTP authentication
- HTTPPasswordMgrWithDefaultRealm: usually used together with the auth handler

Test site that requires authentication: https://ssr3.scrape.center/ (admin / admin)

Accessing it directly without credentials only gets an authentication error, so register the credentials with a password manager:
```python
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, url, username, password)
The first argument is None, meaning the default realm. To register credentials for a specific realm, replace None with that realm's name.
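A minimal sketch of registering credentials per realm (the realm string and the example.com URL are illustrative placeholders, not from the original notes):

```python
import urllib.request

password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# None -> these credentials are tried for any realm at this URL
password_mgr.add_password(None, "https://ssr3.scrape.center/", "admin", "admin")
# A concrete realm string only matches servers announcing that realm (placeholder values)
password_mgr.add_password("Admin Area", "https://example.com/", "user", "secret")
```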
```python
import urllib.request

def auth_login():
    url = "https://ssr3.scrape.center/"
    username = "admin"
    password = "admin"
    # register the credentials, then wrap them in a basic-auth handler
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, url, username, password)
    handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    print(response.code)
    print(response.read().decode('utf-8'))

auth_login()
```
Authenticating by sending the Authorization header directly

```python
import urllib.request

def auth_login():
    url = "https://ssr3.scrape.center/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        # Basic auth credentials passed as a ready-made header
        "Authorization": "Basic YWRtaW46YWRtaW4="
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))

auth_login()
```
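The `YWRtaW46YWRtaW4=` value is simply the Base64 encoding of `admin:admin`; a quick sketch of producing it:

```python
import base64

# "Basic " + base64("username:password") is the HTTP Basic auth scheme
token = base64.b64encode(b"admin:admin").decode()
print("Basic " + token)  # -> Basic YWRtaW46YWRtaW4=
```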
Reading and writing cookie files with cookiejar

Writing:

MozillaCookieJar writes cookies to a file, saving them in the Mozilla browser cookie format.
```python
import urllib.request
import http.cookiejar

filename = 'cookies.txt'
cookie = http.cookiejar.MozillaCookieJar(filename=filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
# persist the captured cookies to disk
cookie.save(ignore_discard=True, ignore_expires=True)
```
LWP format:

LWPCookieJar saves cookies in the libwww-perl (LWP) format. Reading such a file back:
```python
import urllib.request
import http.cookiejar

filename = 'cookies.txt'
cookie = http.cookiejar.LWPCookieJar()
# load previously saved cookies and attach them to the opener
cookie.load(filename, ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
```
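The writing side for the LWP format mirrors the MozillaCookieJar example above; a minimal sketch (the filename is just an example):

```python
import urllib.request
import http.cookiejar

# Save cookies in libwww-perl (LWP) format instead of the Mozilla format
filename = 'cookies_lwp.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
```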
Mini-project: logging in to a website with cookies

Test site: https://www.yaozh.com/login
```python
import urllib.request
import http.cookiejar
import urllib.parse

url = 'https://www.yaozh.com/login'
login_data = {
    "type": "0",
    "username": "xxx",
    "pwd": "xxx",
    "pc_vcode": "86_zh-CN",
    "country": "",
    "mobile": "",
    "vcode": "",
    "pincode": "",
    "formhash": "636BCA3396",
    "backurl": "%252F%252Fwww.yaozh.com%252F",
}
# the cookie jar must be passed to the processor so the login cookies are kept
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
}
# log in first: the opener records the session cookies
login_str = urllib.parse.urlencode(login_data).encode('utf-8')
req = urllib.request.Request(url=url, headers=headers, data=login_str)
opener.open(req)
# then request the member page with the same opener; cookies are attached automatically
login_url = "https://www.yaozh.com/member"
req2 = urllib.request.Request(login_url, headers=headers)
response = opener.open(req2)
data = response.read()
with open('cookie.html', "wb") as f:
    f.write(data)
```
Exception handling

Without exception handling the program's robustness suffers.

urllib's error module:

URLError: inherits from OSError
The basic pattern:

```python
try:
    ...
except error.URLError as e:
    print(e.reason)
```

Test:

```python
import urllib
from urllib import request, error

try:
    url = "https://ssr3.scrape.center/asdasd"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))
except error.URLError as e:
    print(e.reason)
```
HTTPError

Used for handling HTTP error responses; it is a subclass of URLError.
```python
import urllib
from urllib.error import *
from urllib import request

try:
    url = "https://ssr3.scrape.center/asdasd"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))
except HTTPError as e:
    print("http error: ", e)
```
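HTTPError carries more detail than URLError; a minimal sketch of inspecting its attributes:

```python
from urllib import request, error

try:
    request.urlopen("https://ssr3.scrape.center/asdasd")
except error.HTTPError as e:
    # status code, reason phrase and response headers of the failed request
    print(e.code)
    print(e.reason)
    print(e.headers)
```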
Timeout exceptions, handled via URLError
```python
import urllib
from urllib.error import *
import urllib.request
import socket

try:
    url = "https://www.baidu.com"
    response = urllib.request.urlopen(url=url, timeout=0.01)
except URLError as e:
    print(e.reason)
    # a timeout surfaces as a socket.timeout wrapped in URLError
    if isinstance(e.reason, socket.timeout):
        print("Time out!!")
```
URL composition and parsing

urlparse (from the standard-library urllib.parse module) parses a URL into its components:

scheme://netloc/path;params?query#fragment

- scheme (protocol): http or https
- netloc (network location): the host
- path: the path
- params: parameters attached to the path segment
- query: query parameters
- fragment: fragment, used for in-page navigation
```python
from urllib.parse import urlparse

url = 'http://www.baidu.com/index.html;user?id=0#comment'
result = urlparse(url=url)
print(type(result), result)
```

Result:

```
<class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=0', fragment='comment')
```
urlunparse: the inverse of urlparse; it assembles a URL from its components.

```python
from urllib.parse import urlparse, urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'id=0', 'comment']
url = urlunparse(data)
print(url)
```

Result:

```
http://www.baidu.com/index.html;user?id=0#comment
```
urlsplit: similar to urlparse, but it does not split out the params component; the ;params part stays attached to the path, so the result has 5 fields instead of 6.

The result behaves like a tuple, so fields can also be accessed by index.

```python
from urllib.parse import urlsplit, urlunsplit

url = 'http://www.baidu.com/index.html;user?id=0#comment'
parts = urlsplit(url)
print(parts)
print(parts.scheme)
print(parts[0])
print(parts.netloc)
print(parts.path)
print(parts.query)
print(parts.fragment)
```

Result:

```
SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html;user', query='id=0', fragment='comment')
http
http
www.baidu.com
/index.html;user
id=0
comment
```

urlunsplit rebuilds a URL from the 5 parts:

```python
data = ('http', 'www.baidu.com', 'index.html', 'id=0', 'comment')
print(urlunsplit(data))
```

Result:

```
http://www.baidu.com/index.html?id=0#comment
```
urljoin: joins a base URL and a (possibly relative) URL to form an absolute URL.

Parameters:

- base: the base URL, usually an absolute URL
- url: the relative URL
```python
from urllib.parse import urljoin

base_url = 'https://www.baidu.com'
relative_url = 'path/to/xxx'
url = urljoin(base=base_url, url=relative_url)
print(url)
```

Result:

```
https://www.baidu.com/path/to/xxx
```
Note: base_url contributes three components: scheme, netloc and path. If any of these is missing from the new link, it is filled in from the base; if the new link already has it, the new link's value wins. The params, query and fragment of base_url play no role.
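A few illustrative cases of that rule (a sketch; example.com is a placeholder):

```python
from urllib.parse import urljoin

# The relative path reuses scheme and netloc from the base
print(urljoin('https://www.baidu.com', 'FAQ.html'))
# https://www.baidu.com/FAQ.html

# A fully qualified second URL overrides the base entirely
print(urljoin('https://www.baidu.com', 'https://example.com/FAQ.html'))
# https://example.com/FAQ.html

# The base's query string is not carried over
print(urljoin('https://www.baidu.com/about.html?wd=abc', 'index.html'))
# https://www.baidu.com/index.html
```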
Deserializing GET parameters

parse_qs turns a query string into a dict; parse_qsl turns it into a list of tuples.

```python
from urllib.parse import parse_qs, parse_qsl

query = 'name=handsomer1&age=18'
print(parse_qs(query))
print(parse_qsl(query))
```

Result:

```
{'name': ['handsomer1'], 'age': ['18']}
[('name', 'handsomer1'), ('age', '18')]
```
URL encoding and decoding

quote percent-encodes a string for use in a URL; unquote decodes it back.

```python
from urllib.parse import quote, unquote

keyword = "卢本伟"
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)

url_1 = unquote(url)
print(url_1)
```

Result:

```
https://www.baidu.com/s?wd=%E5%8D%A2%E6%9C%AC%E4%BC%9F
https://www.baidu.com/s?wd=卢本伟
```
robotparser: used for parsing robots.txt
```python
from urllib.robotparser import RobotFileParser

robot_parser = RobotFileParser()
# fetch and parse the site's robots.txt
robot_parser.set_url('https://www.zhihu.com/robots.txt')
robot_parser.read()

user_agent = "BaiduSpider"
check_url = 'https://www.zhihu.com'
# check whether this user agent is allowed to fetch the URL
if robot_parser.can_fetch(user_agent, check_url):
    print("this URL may be crawled")
else:
    print("this URL may not be crawled")
```
The requests library

Ordinary requests: get, post, put, ...

```python
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}
r = requests.get('https://www.zhihu.com', headers=headers)
print(r.text)
```
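The other verbs follow the same pattern; a quick sketch against httpbin for illustration:

```python
import requests

# httpbin echoes back whatever we send, which makes it handy for trying out verbs
r = requests.post('http://httpbin.org/post', data={'hello': 'world'})
r = requests.put('http://httpbin.org/put', data={'hello': 'world'})
r = requests.patch('http://httpbin.org/patch', data={'hello': 'world'})
r = requests.delete('http://httpbin.org/delete')
print(r.status_code)
```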
Advanced usage (file upload, RequestsCookieJar, Session, etc.)

File upload:

```python
import requests

url = 'http://httpbin.org/post'
# pass an open file object via the files parameter
file = {'file': open('favicon.ico', 'rb')}
r = requests.post(url, files=file)
print(r.text)
```
Getting cookies:

```python
import requests

url = 'http://www.zhihu.com'
headers = {
    'Cookie': 'xxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}
r = requests.get(url=url, headers=headers)
with open('1.html', 'wb') as f:
    f.write(r.text.encode('utf-8'))

# cookies returned by the server
print(r.cookies)
for k, v in r.cookies.items():
    print(k + "=" + v)
```
Saving cookies into a RequestsCookieJar and sending them with a request:

```python
import requests
import requests.cookies

Cookie = 'xxx'
jar = requests.cookies.RequestsCookieJar()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}
# split the raw cookie string into key/value pairs and fill the jar
for cookie in Cookie.split(';'):
    key, value = cookie.split('=', 1)
    jar.set(key, value)

r = requests.get('https://www.zhihu.com', cookies=jar, headers=headers)
with open('2.html', 'wb') as f:
    f.write(r.text.encode('utf-8'))
```
Session:

```python
import requests

r1 = requests.get('http://httpbin.org/cookies/set/number/123456')
print(r1.text)

r2 = requests.get('http://httpbin.org/cookies')
print(r2.text)
```
The problem: each request is independent, so the cookie set by the first request is gone by the second one.

Using a Session instead:
```python
import requests

# a Session keeps cookies between requests
s = requests.Session()
s.get('http://httpbin.org/cookies/set/number/123456')
r = s.get('http://httpbin.org/cookies')
print(r.text)
```
SSL certificate verification

Test site: https://ssr2.scrape.center/

This is the case where the certificate is invalid and the browser makes you click "proceed anyway" yourself.

Requesting it directly raises an SSL error.

Adding verify=False bypasses the check:
```python
# skip certificate verification for this request
r = requests.get('https://ssr2.scrape.center/', verify=False)
print(r.status_code)
```
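With verify=False, requests prints an InsecureRequestWarning on every call; a minimal sketch of silencing it (optional):

```python
import requests
import urllib3

# suppress the InsecureRequestWarning emitted when verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

r = requests.get('https://ssr2.scrape.center/', verify=False)
print(r.status_code)
```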
Timeouts: the timeout parameter
```python
# raise if the server does not respond within 0.1 s
r = requests.get('https://ssr2.scrape.center/', verify=False, timeout=0.1)
print(r.status_code)
```
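timeout also accepts a (connect, read) tuple, and the failure can be caught as requests.exceptions.Timeout; a sketch:

```python
import requests

try:
    # 3 s to connect, 7 s to read the response
    r = requests.get('https://ssr2.scrape.center/', verify=False, timeout=(3, 7))
    print(r.status_code)
except requests.exceptions.Timeout:
    print("request timed out")
```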
Authentication (HTTPBasicAuth):

```python
import requests
from requests.auth import HTTPBasicAuth

r = requests.get('https://ssr3.scrape.center/', verify=False, auth=HTTPBasicAuth('admin', 'admin'))
print(r.text)
```
Proxy settings:

```python
import requests

# proxy host and port are placeholders
proxy_list = {
    'http': 'http://proxy-host:port',
    'https': 'http://proxy-host:port'
}
requests.get('https://www.baidu.com', proxies=proxy_list)

# SOCKS proxies need the extra dependency: pip install requests[socks]
proxies = {
    'http': 'socks5://username:password@host:port'
}
requests.get('https://www.baidu.com', proxies=proxies)
```
httpx

The httpx library can crawl sites that use HTTP/2.0.

Test site: https://spa16.scrape.center/

HTTP/2.0 support requires an extra install:

pip install "httpx[http2]"

Then enable it in code:

client = httpx.Client(http2=True)
```python
import httpx

url = 'https://spa16.scrape.center/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}
data = {
    "name": "handsome r1"
}
# enable HTTP/2 on the client
client = httpx.Client(http2=True)
response = client.post('https://www.httpbin.org/post', headers=headers, data=data)
print(response.text)
```