beautifulsoup学习

beautiful soup

安装与介绍

1
pip install bs4

用于解析HTML和XML文档
解析器:内置的html.parser、lxml的HTML解析器以及lxml的XML解析器等
文档遍历:跟xpath差不多,也是整理成树形结构
搜索:find() find_all()
修改:增删改查bs4都支持
提取数据
处理特殊字符

解析器:
html.parser(内置)
lxml 速度比较快
xml 速度比较快
html5lib 用的比较少 速度慢

使用示例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Import the BeautifulSoup class from the bs4 package.
from bs4 import BeautifulSoup

# Parse a one-tag HTML fragment with the lxml parser, then print
# the text inside its <p> element.
markup = '<p>hello</p>'
soup = BeautifulSoup(markup, 'lxml')
print(soup.p.string)

#输出:
hello

# Import the BeautifulSoup class from the bs4 package.
from bs4 import BeautifulSoup

# Sample document; the markup is deliberately incomplete (no closing
# </body></html>) — the parser repairs it while building the tree.
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html,'lxml')
# prettify() re-serializes the parsed tree with one tag per line,
# indented to show nesting.
print(soup.prettify())

#结果 bs的解析格式
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="title" name="dromouse">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<!-- Elsie -->
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>

1.节点选择器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from bs4 import BeautifulSoup

html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html,'lxml')
# Select the <title> tag of the document by attribute access.
print(soup.title)
# Print the text content directly under the tag.
print(soup.title.string)
# When several tags match, attribute access returns only the FIRST one.
print(soup.p)
# The selected node is a bs4 Tag object.
print(type(soup.title))
#<class 'bs4.element.Tag'>
# The .name property gives the tag's name ("title").
print(soup.title.name)
# .attrs returns all attributes of the node as a dict.
print(soup.p.attrs)
print(soup.p.attrs['name'])
print(soup.p['name'])

#结果:
<title>The Dormouse's story</title>
The Dormouse's story
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<class 'bs4.element.Tag'>
title
{'class': ['title'], 'name': 'dromouse'}
dromouse
dromouse

2.tag对象嵌套选择

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from bs4 import BeautifulSoup

html="""
<html><head><title>The Dormouse's story</title></head>
<body>
"""

soup = BeautifulSoup(html,'lxml')

# Tag objects can be chained: select <head>, then <title> inside it.
print(soup.head.title)
print(type(soup.head.title))
print(soup.head.title.string)

#输出
<title>The Dormouse's story</title>
<class 'bs4.element.Tag'>
The Dormouse's story

3.关联选择

有时候我们选择的时候不能一步到位,需要先选择某节点,然后选择他的 父节点,子节点

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
子节点:
soup.p.children
子孙节点:
soup.p.descendants
父节点:
soup.a.parent
祖先节点:
soup.a.parents
上一个兄弟节点:
soup.a.previous_sibling
下一个兄弟节点:
soup.a.next_sibling
后面所有的兄弟节点:
soup.a.next_siblings
前面所有的兄弟节点:
soup.a.previous_siblings

父子节点:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from bs4 import BeautifulSoup

html="""
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html,'lxml')
# .contents gives a list of ALL direct children of <p>, text nodes included.
#print(soup.p.contents)
# .children is an iterator over the same direct children.
#print(soup.p.children)
# for i,child in enumerate(soup.p.children):
# print(child)
# .descendants iterates children, grandchildren, etc. (depth-first).
# print(soup.p.descendants)
# for i,child in enumerate(soup.p.children):
# print(child)

# .parent is the direct parent node.
#print(soup.a.parent)
# .parents is a generator of ALL ancestors, nearest first.
print(soup.a.parents)
print(list(enumerate(soup.a.parents)))

兄弟节点:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from bs4 import BeautifulSoup

html = """
<html>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
Hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
</body>
"""
soup = BeautifulSoup(html,'lxml')
# Previous sibling node (here: the leading text node, not a tag).
print(soup.a.previous_sibling)
# Next sibling node (here: the "Hello" text node).
print(soup.a.next_sibling)
# All following siblings — NOTE: this is a generator, printing it
# shows the generator object, not the nodes.
print(soup.a.next_siblings)
# All preceding siblings — also a generator.
print(soup.a.previous_siblings)

#输出
Once upon a time there were three little sisters; and their names were

Hello

<generator object PageElement.next_siblings at 0x000001FE1EBAD270>
<generator object PageElement.previous_siblings at 0x000001FE1EBAD270>

内容与属性的获取:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from bs4 import BeautifulSoup

html = """
<html>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Bob</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
</p>
"""
soup = BeautifulSoup(html,'lxml')
# The two <a> tags are adjacent, so next_sibling of the first <a> is
# the second <a>; .string then yields its text ("Lacie").
print(soup.a.next_sibling.string)

# .parents is a generator of ancestors, nearest first; index 0 is <p>.
print(soup.a.parents)
print(list(soup.a.parents)[0])
print(list(soup.a.parents)[0].attrs['class'])

#结果
Lacie
<generator object PageElement.parents at 0x0000010FD9CAD200>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Bob</a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
</p>
['story']

4.方法选择器

find_all,find方法

find_all方法基本使用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from bs4 import BeautifulSoup

html = """
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
"""
soup = BeautifulSoup(html,'lxml')

# find_all(name=...) returns a ResultSet of every matching tag.
ul_tags = soup.find_all(name='ul')
print(ul_tags)
print(ul_tags[0])
print(type(ul_tags[1]))
# Nested search: each Tag object supports find_all() as well.
for ul_tag in ul_tags:
    li_tags = ul_tag.find_all(name='li')
    print(type(li_tags))
    for li_tag in li_tags:
        print(li_tag.string)

#结果:
[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>, <ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>]
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<class 'bs4.element.Tag'>
<class 'bs4.element.ResultSet'>
Foo
Bar
Jay
<class 'bs4.element.ResultSet'>
Foo
Bar

根据属性:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from bs4 import BeautifulSoup

html = """
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
"""
soup = BeautifulSoup(html,'lxml')
# find_all with an attrs dict: all elements whose id is "list-1".
print(soup.find_all(attrs={'id':'list-1'}))

# Matching by class: attrs dict and the class_ shortcut are equivalent
# (the trailing underscore avoids clashing with the `class` keyword).
print(soup.find_all(attrs={'class':'element'}))
print(soup.find_all(class_='element'))

#结果:
[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]

接受正则表达式作为参数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from bs4 import BeautifulSoup
import re

html = """
<div class="panel">
<div class="panel-body">
<a>Hello, this is a link</a>
<a>Hello, this is a link, too</a>
</div>
"""
soup = BeautifulSoup(html,'lxml')
# Find every text node whose content matches the regex "link".
# NOTE: the old spelling soup.findAll(text=...) is deprecated —
# findAll is the pre-4.0 alias of find_all, and the text= argument
# was renamed to string= in bs4 4.4.
found_elements = soup.find_all(string=re.compile('link'))
print(found_elements)
#结果:
['Hello, this is a link', 'Hello, this is a link, too']

find方法使用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from bs4 import BeautifulSoup

html = """
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
"""
soup = BeautifulSoup(html,'lxml')
# find() returns only the FIRST match (or None), not a ResultSet.
ul_tag = soup.find(name='ul')
print(ul_tag)

# class_ must match the full class string "list list-small" here.
list_element = soup.find(class_='list list-small')
print(list_element)
#结果:
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>

5.css选择器

调用select方法,传入对应的css

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from bs4 import BeautifulSoup

html = """
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
"""
soup = BeautifulSoup(html,'lxml')

# Descendant combinator with class selectors.
print(soup.select('.panel .panel-heading'))
# Every <li> somewhere under a <ul>.
print(soup.select('ul li'))
# id selector combined with a class selector.
print(soup.select('#list-2 .element'))
# Nested selection: Tag objects expose select() too.
for ul_tag in soup.select('ul'):
    print(ul_tag.select('li'))
    print(ul_tag.attrs['id'])
# Text extraction: .string vs get_text() (identical for leaf tags).
for li_tag in soup.select('li'):
    print(li_tag.string)
    print(li_tag.get_text())

#结果:
[<div class="panel-heading">
<h4>Hello</h4>
</div>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
list-1
[<li class="element">Foo</li>, <li class="element">Bar</li>]
list-2
Foo
Foo
Bar
Bar
Jay
Jay
Foo
Foo
Bar
Bar

bs项目:爬取小说

爬取小说

目标网站:https://www.biqukan8.cc/(笔趣看)

逻辑梳理:
1.拿到全部详情页面,和章节名字

https://www.biqukan8.cc/0_790/ 以元尊为例

image-20231119194803529

2.请求详情页面,匹配出小说内容

3.创建一个txt文件,把匹配出来的小说内容保存

步骤

1.请求要爬取的小说页面https://www.biqukan8.cc/0_790/

2.获取到正文卷所有章节的链接和名字

image-20231119201322458

先获取整个div标签,通过bs解析,然后获取dt标签,获取小说名字

1
2
3
4
5
6
7
8
soup = BeautifulSoup(response.text,"lxml")
# find_all is the current API name; findAll is the deprecated pre-4.0 alias.
chapters = soup.find_all('div',class_='listmain')
# Re-parse just the chapter-list <div> so .dl/.dt navigation starts there.
download_soup = BeautifulSoup(str(chapters),'lxml')
# The <dt> text looks like "[<dt>《元尊》正文卷"; slicing off the 5-char
# prefix and splitting at 》 leaves just the novel title.
novel_name = str(download_soup.dl.dt).split("》")[0][5:]

需要过滤掉最新章节的内容才开始爬取,避免重复爬取,获取到章节的链接和名字

1
2
3
4
5
6
7
8
9
10
11
# Skip the "latest chapters" teaser at the top of the list to avoid
# scraping duplicates: only start collecting links once the
# "正文卷" (main text) heading row has been seen.
begin_flag = False
for child in download_soup.dl.children:
    if child != '\n':
        if child.string == u"%s" % flag_name:
            begin_flag = True
        # `is not None` is the idiomatic null check (was: != None).
        if begin_flag and child.a is not None:
            # No trailing slash on the base URL: the href already starts
            # with "/" (matches the final script's join).
            download_url = "https://www.biqukan8.cc" + child.a.get("href")
            download_name = child.a.string
            print(download_url)
            print(download_name)

3.爬取详情页的小说文本内容

image-20231119203911843

单个章节div的获取:

1
2
3
4
# Fetch one chapter page (index 100 of the collected URL list).
txt_response = requests.get(url=url_list[100],headers=headers)
txt_soup = BeautifulSoup(str(txt_response.text),"lxml")
# The chapter body lives in <div id="content" class="showtxt">.
txt = txt_soup.find_all(id='content',class_='showtxt')
print(txt)

image-20231119204840703

获取div标签里的所有文字

1
2
3
4
# Extract the plain text from the ResultSet obtained above.
text_soup = BeautifulSoup(str(txts),"lxml")
text = text_soup.div.text  # all text nodes inside the <div>, concatenated
print(text)

4.将小说按章节读取并且保存到txt文件里

创建存放小说的目录 并写入

1
2
3
4
5
6
7
8
9
10
11
12
13
def file_write(name, text):
    """Append chapter *text* to <local_save_path>/<novel_name>/<name>.txt.

    Relies on the module-level globals local_save_path and novel_name.
    """
    directory_path = local_save_path + novel_name
    if os.path.exists(directory_path):
        print(f"目录:'{directory_path}'存在!")
    else:
        # makedirs also creates missing parent directories;
        # os.mkdir would raise FileNotFoundError if a parent is absent.
        os.makedirs(directory_path)
        print(f"目录:'{directory_path}'已经创建!")
    # Append mode so repeated runs keep adding to the same chapter file.
    name_path = os.path.join(directory_path, f"{name}.txt")
    with open(name_path, 'a+', encoding='utf-8') as file:
        file.write(text)


file_write('第一百零一章 大战来临', text=text)

最后循环爬取章节url列表中的url即可

整体代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
"""Scrape a novel from biqukan8.cc chapter by chapter and save every
chapter as its own .txt file under local_save_path/<novel_name>/."""

from bs4 import BeautifulSoup
import requests
import os
import logging
from fake_useragent import UserAgent

# Local directory that will hold one sub-directory per novel.
local_save_path = 'E:/secStudy/pythonProject/beautifulsoup/novel/'

ua = UserAgent()
headers = {
    "User-Agent": ua.random
}
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s-%(levelname)s:%(message)s')
url = 'https://www.biqukan8.cc/0_790/'


def novel_content(url, name):
    """Fetch one chapter page, extract its text and write it to disk."""
    txt_response = requests.get(url=url, headers=headers)
    txts_soup = BeautifulSoup(str(txt_response.text), "lxml")
    # The chapter body lives in <div id="content" class="showtxt">.
    txts = txts_soup.find_all(id='content', class_='showtxt')
    text_soup = BeautifulSoup(str(txts), "lxml")
    text = text_soup.div.text  # all text inside the <div>, concatenated
    file_write(name, text)


def file_write(name, text):
    """Append chapter *text* to <local_save_path>/<novel_name>/<name>.txt.

    Relies on the module-level globals local_save_path and novel_name.
    """
    directory_path = local_save_path + novel_name
    # makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair and
    # also creates any missing parent directories.
    os.makedirs(directory_path, exist_ok=True)
    write_flag = True
    name_path = os.path.join(directory_path, f"{name}.txt")
    print(f"正在爬取:{name}")
    with open(name_path, 'a+', encoding='utf-8') as file:
        for each in text:
            # The page text ends with an 'http...' ad link; stop copying
            # once the first 'h' of that trailer is seen.
            # NOTE(review): this also truncates at any 'h' in the body —
            # presumably acceptable for this site; verify on real pages.
            if each == 'h':
                write_flag = False
            if write_flag and each != '':
                file.write(each)
        file.write('\n\n')


response = requests.get(url, headers=headers)
response.encoding = 'gbk'  # the site serves GBK-encoded pages
soup = BeautifulSoup(response.text, "lxml")
# find_all is the current API name; findAll is the deprecated alias.
chapters = soup.find_all('div', class_='listmain')
# Re-parse just the chapter-list <div> so .dl/.dt navigation starts there.
download_soup = BeautifulSoup(str(chapters), 'lxml')
# The <dt> text looks like "[<dt>《元尊》正文卷"; slicing off the 5-char
# prefix and splitting at 》 leaves just the novel title.
novel_name = str(download_soup.dl.dt).split("》")[0][5:]
# Marker row that separates the "latest chapters" teaser from the
# full chapter list, e.g. "《元尊》正文卷".
flag_name = "《" + novel_name + "》" + "正文卷"

# Chapter URLs and names, collected in page order.
url_list = []
name_list = []
# Skip the teaser block to avoid scraping duplicates: only start
# collecting once the marker row has been seen.
begin_flag = False
for child in download_soup.dl.children:
    if child != '\n':
        if child.string == u"%s" % flag_name:
            begin_flag = True
        if begin_flag and child.a is not None:
            download_url = "https://www.biqukan8.cc" + child.a.get("href")
            download_name = child.a.string
            url_list.append(download_url)
            name_list.append(download_name)

# Download every chapter in order.
for chapter_url, chapter_name in zip(url_list, name_list):
    novel_content(chapter_url, chapter_name)
image-20231119213400492 image-20231119213420651

待优化:

1.章节内容存在空格,需要做换行处理

2.把所有章节存到一个txt文件中,进行分章节的处理


beautifulsoup学习
http://example.com/2023/11/27/beautifulsoup学习/
作者
r1
发布于
2023年11月27日
许可协议