Web Scraping Notes: urllib, Python's built-in HTTP request library
Its 4 main modules:
1. request: the most basic HTTP request module
2. error: the exception handling module
3. parse: a utility module providing methods for handling URLs
4. robotparser: parses robots.txt files
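As a quick orientation (a sketch, not part of the original notes), the four submodules are imported individually:

```python
# The four urllib submodules are imported separately
import urllib.request      # sending HTTP requests (urlopen, Request, openers)
import urllib.error        # URLError / HTTPError exceptions
import urllib.parse        # urlparse, urlencode, quote, ...
import urllib.robotparser  # RobotFileParser for robots.txt
```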
request: a first crawler

```python
import urllib.request

def load_baidu_data():
    url = 'http://www.baidu.com/'
    response = urllib.request.urlopen(url)
    data = response.read()
    str_data = data.decode('utf-8')
    # save the page to a local file
    with open('baidu.html', "w", encoding="utf-8") as f:
        f.write(str_data)

load_baidu_data()
```
GET requests

Percent-encoding a keyword with quote:

```python
import urllib.request
import urllib.parse
import string

def load_baidu_data():
    url = 'https://www.baidu.com/s?wd='
    name = "卢本伟"
    final_url = url + name
    # percent-encode the non-ASCII part of the URL
    encode_url = urllib.parse.quote(final_url, safe=string.printable)
    print(encode_url)
    response = urllib.request.urlopen(encode_url)
    print(response.code)

load_baidu_data()
```

Building a query string with urlencode:

```python
import urllib.request
import urllib.parse

def load_baidu_data():
    url = 'https://www.baidu.com/s?'
    params = {
        "wd": "孙悟空",
        "pn": "80"
    }
    # build the query string from a dict
    query_str = urllib.parse.urlencode(params)
    print(query_str)
    final_url = url + query_str
    print(final_url)
    response = urllib.request.urlopen(final_url)
    str_data = response.read().decode('utf-8')
    print(str_data)
    with open('baidu-wukong-pn80.html', "w", encoding="utf-8") as f:
        f.write(str_data)

load_baidu_data()
```
POST requests

```python
import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'
data = {
    'hello': 'world',
    'handsome': 'r1cky'
}
# POST data must be bytes: urlencode the dict, then encode
data_encode = urllib.parse.urlencode(data).encode('utf-8')
response = urllib.request.urlopen(url=url, data=data_encode)
print(response.read().decode("utf-8"))
```
Timeout control: the timeout parameter

```python
# timeout (in seconds) makes urlopen raise if the server is too slow
response = urllib.request.urlopen(url=url, data=data_encode, timeout=0.1)

try:
    response = urllib.request.urlopen(url=url, data=data_encode, timeout=0.1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("connection timed out")
```
Custom User-Agent

The core of a crawler is imitating a real user!!!
```python
import urllib.request
import urllib.error
import urllib.parse

url = 'http://httpbin.org/post'
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    "hello": "world"
}
# build a Request object so custom headers can be attached
req = urllib.request.Request(url=url, headers=header, method='POST')
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))
```
Mini-project: random UA headers

Switch to a different User-Agent on every request.

UA header data source: UserAgent大全_常用的UserAgent库下载_UserAgent在线工具_流行的UserAgent手机库UA_UA标识网 (kuzhazha.com)

Approach:
1. Define a list of several UA headers
2. Pick a different UA header for each request
```python
import urllib.request
import urllib.error
import urllib.parse
import random

def user_agent():
    url = 'http://httpbin.org/post'
    user_agent_list = [
        "Mozilla/5.0 (Linux; Android 12; ELS-AN00 Build/HUAWEIELS-AN00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/107.0.5304.141 Mobile Safari/537.36 XWEB/5075 MMWEBSDK/20230504 MMWEBID/9308 MicroMessenger/8.0.37.2380(0x2800253D) WeChat/arm64 Weixin NetType/5G Language/zh_CN ABI/arm64 MiniProgramEnv/android",
        "Mozilla/5.0 (iPhone; CPU iPhone OS............ile/15E148 MicroMessenger/8.0.34(0x18002234) NetType/4G Language/zh_CN",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; hu; rv:1.8.0.11) Gecko/20070312 Firefox/1.5.0.11",
        "Mozilla/5.0 (Macintosh; Int............ecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67",
        "Mozilla/5.0 (X1............7.6) Gecko/20050318 Firefox/1.0.2",
        "Mozilla/5.0 (Windows; U; Win............o) Chrome/100.0.4896.58 Safari/537.36 UCBrowser/15.4.8.1238",
    ]
    # pick a random UA for this request
    random_user_agent = random.choice(user_agent_list)
    req = urllib.request.Request(url=url, method='POST')
    req.add_header("User-Agent", random_user_agent)
    response = urllib.request.urlopen(req)
    print(response.read().decode("utf-8"))

user_agent()
```
Custom openers

Handlers:
- HTTPDefaultErrorHandler: handles HTTP response errors; errors are raised as HTTPError exceptions.
- HTTPRedirectHandler: handles redirects.
- HTTPCookieProcessor: handles cookies.
- ProxyHandler: sets a proxy; by default no proxy is used.
- HTTPPasswordMgr: manages passwords; it keeps a table of usernames and passwords.
- HTTPBasicAuthHandler: manages authentication; use it when opening a link that requires authentication.
```python
import urllib.request
import urllib.error
import urllib.parse

def handler_open():
    url = 'http://httpbin.org/get'
    # build an opener around an HTTPHandler instead of calling urlopen directly
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    print(response.read().decode("utf-8"))

handler_open()
```
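build_opener also accepts several handlers at once, so different capabilities can be combined into a single opener. A minimal sketch, with the proxy address left as a placeholder:

```python
import urllib.request
import http.cookiejar

# Combine cookie handling and a proxy in one opener (proxy address is a placeholder)
cookie_jar = http.cookiejar.CookieJar()
handlers = [
    urllib.request.HTTPCookieProcessor(cookie_jar),
    urllib.request.ProxyHandler({"http": "http://ip:port"}),
]
opener = urllib.request.build_opener(*handlers)
# opener.open('http://httpbin.org/get')  # requests would go through the proxy, with cookies tracked
```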
Configuring a proxy

Proxy types:
- Transparent: the target can see our real IP
- Anonymous: the target cannot see our IP, but knows we are using a proxy
- High-anonymity: the target neither sees our IP nor knows a proxy is in use

Free proxies are mostly unreliable.
```python
import urllib.request
import urllib.error
import urllib.parse

def handler_open():
    try:
        url = 'http://httpbin.org/post'
        proxy = {
            "http": "http://ip:port"
        }
        # route requests through the proxy
        proxy_handler = urllib.request.ProxyHandler(proxy)
        opener = urllib.request.build_opener(proxy_handler)
        response = opener.open(url)
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("error: ", e)

handler_open()
```
Mini-project: random proxies

Proxies can be purchased at 快代理 - 企业级HTTP代理IP云服务 (kuaidaili.com).

Configure a proxy pool:
```python
import urllib.request
import urllib.error
import urllib.parse
import random

def proxy_ip():
    url = 'http://httpbin.org/get'
    ip_list = [
        "http://111.224.217.xxx:19394",
        "http://114.220.35.xxx:19942",
        "http://117.87.209.xxx:23350",
        "http://121.233.172.xxx:22936",
        "http://182.38.126.xxx:16113"
    ]
    # pick a random proxy from the pool for this request
    proxy = random.choice(ip_list)
    try:
        proxy_handler = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
        opener = urllib.request.build_opener(proxy_handler)
        response = opener.open(url)
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("error: ", e)

proxy_ip()
```
Authorization

Authentication handlers:
- HTTPBasicAuthHandler: handles basic HTTP authentication
- HTTPPasswordMgrWithDefaultRealm: usually used together with the auth handler

Test site that requires authentication: https://ssr3.scrape.center/ (admin / admin)

Accessing it directly without credentials only gets an authentication error, so register the credentials with a password manager:
```python
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, url, username, password)
The first argument is None, meaning the default realm. To register credentials for a specific realm, replace None with that realm's name.
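A minimal sketch of registering credentials per realm (the realm string and the example.com URL are illustrative placeholders, not from the original notes):

```python
import urllib.request

password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# None -> these credentials are tried for any realm at this URL
password_mgr.add_password(None, "https://ssr3.scrape.center/", "admin", "admin")
# A concrete realm string only matches servers announcing that realm (placeholder values)
password_mgr.add_password("Admin Area", "https://example.com/", "user", "secret")
```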
```python
import urllib.request

def auth_login():
    url = "https://ssr3.scrape.center/"
    username = "admin"
    password = "admin"
    # register the credentials, then wrap them in a basic-auth handler
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, url, username, password)
    handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    print(response.code)
    print(response.read().decode('utf-8'))

auth_login()
```
Authenticating by sending the Authorization header directly

```python
import urllib.request

def auth_login():
    url = "https://ssr3.scrape.center/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        # Basic auth credentials passed as a ready-made header
        "Authorization": "Basic YWRtaW46YWRtaW4="
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))

auth_login()
```
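The `YWRtaW46YWRtaW4=` value is simply the Base64 encoding of `admin:admin`; a quick sketch of producing it:

```python
import base64

# "Basic " + base64("username:password") is the HTTP Basic auth scheme
token = base64.b64encode(b"admin:admin").decode()
print("Basic " + token)  # -> Basic YWRtaW46YWRtaW4=
```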
Reading and writing cookie files with cookiejar

Writing:

MozillaCookieJar writes cookies to a file, saving them in the Mozilla browser cookie format.
```python
import urllib.request
import http.cookiejar

filename = 'cookies.txt'
cookie = http.cookiejar.MozillaCookieJar(filename=filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
# persist the captured cookies to disk
cookie.save(ignore_discard=True, ignore_expires=True)
```
LWP format:

LWPCookieJar saves cookies in the libwww-perl (LWP) format. Reading such a file back:
```python
import urllib.request
import http.cookiejar

filename = 'cookies.txt'
cookie = http.cookiejar.LWPCookieJar()
# load previously saved cookies and attach them to the opener
cookie.load(filename, ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
```
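The writing side for the LWP format mirrors the MozillaCookieJar example above; a minimal sketch (the filename is just an example):

```python
import urllib.request
import http.cookiejar

# Save cookies in libwww-perl (LWP) format instead of the Mozilla format
filename = 'cookies_lwp.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
```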
Mini-project: logging in to a website with cookies

Test site: https://www.yaozh.com/login
```python
import urllib.request
import http.cookiejar
import urllib.parse

url = 'https://www.yaozh.com/login'
login_data = {
    "type": "0",
    "username": "xxx",
    "pwd": "xxx",
    "pc_vcode": "86_zh-CN",
    "country": "",
    "mobile": "",
    "vcode": "",
    "pincode": "",
    "formhash": "636BCA3396",
    "backurl": "%252F%252Fwww.yaozh.com%252F",
}
# the cookie jar must be passed to the processor so the login cookies are kept
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
}
# log in first: the opener records the session cookies
login_str = urllib.parse.urlencode(login_data).encode('utf-8')
req = urllib.request.Request(url=url, headers=headers, data=login_str)
opener.open(req)
# then request the member page with the same opener; cookies are attached automatically
login_url = "https://www.yaozh.com/member"
req2 = urllib.request.Request(login_url, headers=headers)
response = opener.open(req2)
data = response.read()
with open('cookie.html', "wb") as f:
    f.write(data)
```
Exception handling

Without exception handling the program's robustness suffers.

urllib's error module:

URLError: inherits from OSError
The basic pattern:

```python
try:
    ...
except error.URLError as e:
    print(e.reason)
```

Test:

```python
import urllib
from urllib import request, error

try:
    url = "https://ssr3.scrape.center/asdasd"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))
except error.URLError as e:
    print(e.reason)
```
HTTPError

Used for handling HTTP error responses; it is a subclass of URLError.
```python
import urllib
from urllib.error import *
from urllib import request

try:
    url = "https://ssr3.scrape.center/asdasd"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))
except HTTPError as e:
    print("http error: ", e)
```
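HTTPError carries more detail than URLError; a minimal sketch of inspecting its attributes:

```python
from urllib import request, error

try:
    request.urlopen("https://ssr3.scrape.center/asdasd")
except error.HTTPError as e:
    # status code, reason phrase and response headers of the failed request
    print(e.code)
    print(e.reason)
    print(e.headers)
```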
Timeout exceptions, handled via URLError
```python
import urllib
from urllib.error import *
import urllib.request
import socket

try:
    url = "https://www.baidu.com"
    response = urllib.request.urlopen(url=url, timeout=0.01)
except URLError as e:
    print(e.reason)
    # a timeout surfaces as a socket.timeout wrapped in URLError
    if isinstance(e.reason, socket.timeout):
        print("Time out!!")
```
URL composition and parsing

urlparse (from the standard-library urllib.parse module) parses a URL into its components:

scheme://netloc/path;params?query#fragment

- scheme (protocol): http or https
- netloc (network location): the host
- path: the path
- params: parameters attached to the path segment
- query: query parameters
- fragment: fragment, used for in-page navigation
```python
from urllib.parse import urlparse

url = 'http://www.baidu.com/index.html;user?id=0#comment'
result = urlparse(url=url)
print(type(result), result)
```

Result:

```
<class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=0', fragment='comment')
```
urlunparse: the inverse of urlparse; it assembles a URL from its components.

```python
from urllib.parse import urlparse, urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'id=0', 'comment']
url = urlunparse(data)
print(url)
```

Result:

```
http://www.baidu.com/index.html;user?id=0#comment
```
urlsplit: similar to urlparse, but it does not split out the params component; the ;params part stays attached to the path, so the result has 5 fields instead of 6.

The result behaves like a tuple, so fields can also be accessed by index.

```python
from urllib.parse import urlsplit, urlunsplit

url = 'http://www.baidu.com/index.html;user?id=0#comment'
parts = urlsplit(url)
print(parts)
print(parts.scheme)
print(parts[0])
print(parts.netloc)
print(parts.path)
print(parts.query)
print(parts.fragment)
```

Result:

```
SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html;user', query='id=0', fragment='comment')
http
http
www.baidu.com
/index.html;user
id=0
comment
```

urlunsplit rebuilds a URL from the 5 parts:

```python
data = ('http', 'www.baidu.com', 'index.html', 'id=0', 'comment')
print(urlunsplit(data))
```

Result:

```
http://www.baidu.com/index.html?id=0#comment
```
urljoin: joins a base URL and a (possibly relative) URL to form an absolute URL.

Parameters:

- base: the base URL, usually an absolute URL
- url: the relative URL
```python
from urllib.parse import urljoin

base_url = 'https://www.baidu.com'
relative_url = 'path/to/xxx'
url = urljoin(base=base_url, url=relative_url)
print(url)
```

Result:

```
https://www.baidu.com/path/to/xxx
```
Note: base_url contributes three components: scheme, netloc and path. If any of these is missing from the new link, it is filled in from the base; if the new link already has it, the new link's value wins. The params, query and fragment of base_url play no role.
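A few illustrative cases of that rule (a sketch; example.com is a placeholder):

```python
from urllib.parse import urljoin

# The relative path reuses scheme and netloc from the base
print(urljoin('https://www.baidu.com', 'FAQ.html'))
# https://www.baidu.com/FAQ.html

# A fully qualified second URL overrides the base entirely
print(urljoin('https://www.baidu.com', 'https://example.com/FAQ.html'))
# https://example.com/FAQ.html

# The base's query string is not carried over
print(urljoin('https://www.baidu.com/about.html?wd=abc', 'index.html'))
# https://www.baidu.com/index.html
```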
Deserializing GET parameters

parse_qs turns a query string into a dict; parse_qsl turns it into a list of tuples.

```python
from urllib.parse import parse_qs, parse_qsl

query = 'name=handsomer1&age=18'
print(parse_qs(query))
print(parse_qsl(query))
```

Result:

```
{'name': ['handsomer1'], 'age': ['18']}
[('name', 'handsomer1'), ('age', '18')]
```
URL encoding and decoding

quote percent-encodes a string for use in a URL; unquote decodes it back.

```python
from urllib.parse import quote, unquote

keyword = "卢本伟"
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)

url_1 = unquote(url)
print(url_1)
```

Result:

```
https://www.baidu.com/s?wd=%E5%8D%A2%E6%9C%AC%E4%BC%9F
https://www.baidu.com/s?wd=卢本伟
```
robotparser: used for parsing robots.txt
```python
from urllib.robotparser import RobotFileParser

robot_parser = RobotFileParser()
# fetch and parse the site's robots.txt
robot_parser.set_url('https://www.zhihu.com/robots.txt')
robot_parser.read()

user_agent = "BaiduSpider"
check_url = 'https://www.zhihu.com'
# check whether this user agent is allowed to fetch the URL
if robot_parser.can_fetch(user_agent, check_url):
    print("this URL may be crawled")
else:
    print("this URL may not be crawled")
```
The requests library

Ordinary requests: get, post, put, ...

```python
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}
r = requests.get('https://www.zhihu.com', headers=headers)
print(r.text)
```
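The other verbs follow the same pattern; a quick sketch against httpbin for illustration:

```python
import requests

# httpbin echoes back whatever we send, which makes it handy for trying out verbs
r = requests.post('http://httpbin.org/post', data={'hello': 'world'})
r = requests.put('http://httpbin.org/put', data={'hello': 'world'})
r = requests.patch('http://httpbin.org/patch', data={'hello': 'world'})
r = requests.delete('http://httpbin.org/delete')
print(r.status_code)
```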
Advanced usage (file upload, RequestsCookieJar, Session, etc.)

File upload:

```python
import requests

url = 'http://httpbin.org/post'
# pass an open file object via the files parameter
file = {'file': open('favicon.ico', 'rb')}
r = requests.post(url, files=file)
print(r.text)
```
Getting cookies:

```python
import requests

url = 'http://www.zhihu.com'
headers = {
    'Cookie': 'xxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}
r = requests.get(url=url, headers=headers)
with open('1.html', 'wb') as f:
    f.write(r.text.encode('utf-8'))

# cookies returned by the server
print(r.cookies)
for k, v in r.cookies.items():
    print(k + "=" + v)
```
Saving cookies into a RequestsCookieJar and sending them with a request:

```python
import requests
import requests.cookies

Cookie = 'xxx'
jar = requests.cookies.RequestsCookieJar()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}
# split the raw cookie string into key/value pairs and fill the jar
for cookie in Cookie.split(';'):
    key, value = cookie.split('=', 1)
    jar.set(key, value)

r = requests.get('https://www.zhihu.com', cookies=jar, headers=headers)
with open('2.html', 'wb') as f:
    f.write(r.text.encode('utf-8'))
```
Session:

```python
import requests

r1 = requests.get('http://httpbin.org/cookies/set/number/123456')
print(r1.text)

r2 = requests.get('http://httpbin.org/cookies')
print(r2.text)
```
The problem: each request is independent, so the cookie set by the first request is gone by the second one.

Using a Session instead:
```python
import requests

# a Session keeps cookies between requests
s = requests.Session()
s.get('http://httpbin.org/cookies/set/number/123456')
r = s.get('http://httpbin.org/cookies')
print(r.text)
```
SSL certificate verification

Test site: https://ssr2.scrape.center/

This is the case where the certificate is invalid and the browser makes you click "proceed anyway" yourself.

Requesting it directly raises an SSL error.

Adding verify=False bypasses the check:
```python
# skip certificate verification for this request
r = requests.get('https://ssr2.scrape.center/', verify=False)
print(r.status_code)
```
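With verify=False, requests prints an InsecureRequestWarning on every call; a minimal sketch of silencing it (optional):

```python
import requests
import urllib3

# suppress the InsecureRequestWarning emitted when verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

r = requests.get('https://ssr2.scrape.center/', verify=False)
print(r.status_code)
```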
Timeouts: the timeout parameter
```python
# raise if the server does not respond within 0.1 s
r = requests.get('https://ssr2.scrape.center/', verify=False, timeout=0.1)
print(r.status_code)
```
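timeout also accepts a (connect, read) tuple, and the failure can be caught as requests.exceptions.Timeout; a sketch:

```python
import requests

try:
    # 3 s to connect, 7 s to read the response
    r = requests.get('https://ssr2.scrape.center/', verify=False, timeout=(3, 7))
    print(r.status_code)
except requests.exceptions.Timeout:
    print("request timed out")
```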
Authentication (HTTPBasicAuth):

```python
import requests
from requests.auth import HTTPBasicAuth

r = requests.get('https://ssr3.scrape.center/', verify=False, auth=HTTPBasicAuth('admin', 'admin'))
print(r.text)
```
Proxy settings:

```python
import requests

# proxy host and port are placeholders
proxy_list = {
    'http': 'http://proxy-host:port',
    'https': 'http://proxy-host:port'
}
requests.get('https://www.baidu.com', proxies=proxy_list)

# SOCKS proxies need the extra dependency: pip install requests[socks]
proxies = {
    'http': 'socks5://username:password@host:port'
}
requests.get('https://www.baidu.com', proxies=proxies)
```
httpx

The httpx library can crawl sites that use HTTP/2.0.

Test site: https://spa16.scrape.center/

HTTP/2.0 support requires an extra install:

pip install "httpx[http2]"

Then enable it in code:

client = httpx.Client(http2=True)
```python
import httpx

url = 'https://spa16.scrape.center/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}
data = {
    "name": "handsome r1"
}
# enable HTTP/2 on the client
client = httpx.Client(http2=True)
response = client.post('https://www.httpbin.org/post', headers=headers, data=data)
print(response.text)
```