beautifulsoup学习

beautiful soup

安装与介绍

1
pip install bs4

用于解析HTML和XML文档
解析器:内置的html.parser、lxml的HTML解析器以及lxml的XML解析器等
文档遍历:跟xpath差不多,也是整理成树形结构
搜索:find() find_all()
修改:增删改查bs4都支持
提取数据
处理特殊字符

解析器:
html.parser(内置)
lxml 速度比较快
xml 速度比较快
html5lib 用的比较少 速度慢

使用示例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Import the BeautifulSoup class from the bs4 package.
from bs4 import BeautifulSoup

# Parse a one-tag HTML fragment with the lxml parser, then print
# the text inside its <p> element.
markup = '<p>hello</p>'
soup = BeautifulSoup(markup, 'lxml')
print(soup.p.string)

#输出:
hello

# Import the BeautifulSoup class from the bs4 package.
from bs4 import BeautifulSoup

# Sample document; the markup is deliberately incomplete (no closing
# </body></html>) — the parser repairs it while building the tree.
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html,'lxml')
# prettify() re-serializes the parsed tree with one tag per line,
# indented to show nesting.
print(soup.prettify())

#结果 bs的解析格式
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="title" name="dromouse">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<!-- Elsie -->
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>

1.节点选择器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from bs4 import BeautifulSoup

html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html,'lxml')
# Select the <title> tag of the document by attribute access.
print(soup.title)
# Print the text content directly under the tag.
print(soup.title.string)
# When several tags match, attribute access returns only the FIRST one.
print(soup.p)
# The selected node is a bs4 Tag object.
print(type(soup.title))
#<class 'bs4.element.Tag'>
# The .name property gives the tag's name ("title").
print(soup.title.name)
# .attrs returns all attributes of the node as a dict.
print(soup.p.attrs)
print(soup.p.attrs['name'])
print(soup.p['name'])

#结果:
<title>The Dormouse's story</title>
The Dormouse's story
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<class 'bs4.element.Tag'>
title
{'class': ['title'], 'name': 'dromouse'}
dromouse
dromouse

2.tag对象嵌套选择

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from bs4 import BeautifulSoup

html="""
<html><head><title>The Dormouse's story</title></head>
<body>
"""

soup = BeautifulSoup(html,'lxml')

# Tag objects can be chained: select <head>, then <title> inside it.
print(soup.head.title)
print(type(soup.head.title))
print(soup.head.title.string)

#输出
<title>The Dormouse's story</title>
<class 'bs4.element.Tag'>
The Dormouse's story

3.关联选择

有时候我们选择的时候不能一步到位,需要先选择某节点,然后选择他的 父节点,子节点

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
子节点:
soup.p.children
子孙节点:
soup.p.descendants
父节点:
soup.a.parent
祖先节点:
soup.a.parents
上一个兄弟节点:
soup.a.previous_sibling
下一个兄弟节点:
soup.a.next_sibling
后面所有的兄弟节点:
soup.a.next_siblings
前面所有的兄弟节点:
soup.a.previous_siblings

父子节点:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from bs4 import BeautifulSoup

html="""
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html,'lxml')
# .contents gives a list of ALL direct children of <p>, text nodes included.
#print(soup.p.contents)
# .children is an iterator over the same direct children.
#print(soup.p.children)
# for i,child in enumerate(soup.p.children):
# print(child)
# .descendants iterates children, grandchildren, etc. (depth-first).
# print(soup.p.descendants)
# for i,child in enumerate(soup.p.children):
# print(child)

# .parent is the direct parent node.
#print(soup.a.parent)
# .parents is a generator of ALL ancestors, nearest first.
print(soup.a.parents)
print(list(enumerate(soup.a.parents)))

兄弟节点:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from bs4 import BeautifulSoup

html = """
<html>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
Hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
</body>
"""
soup = BeautifulSoup(html,'lxml')
# Previous sibling node (here: the leading text node, not a tag).
print(soup.a.previous_sibling)
# Next sibling node (here: the "Hello" text node).
print(soup.a.next_sibling)
# All following siblings — NOTE: this is a generator, printing it
# shows the generator object, not the nodes.
print(soup.a.next_siblings)
# All preceding siblings — also a generator.
print(soup.a.previous_siblings)

#输出
Once upon a time there were three little sisters; and their names were

Hello

<generator object PageElement.next_siblings at 0x000001FE1EBAD270>
<generator object PageElement.previous_siblings at 0x000001FE1EBAD270>

内容与属性的获取:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from bs4 import BeautifulSoup

html = """
<html>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Bob</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
</p>
"""
soup = BeautifulSoup(html,'lxml')
# The two <a> tags are adjacent, so next_sibling of the first <a> is
# the second <a>; .string then yields its text ("Lacie").
print(soup.a.next_sibling.string)

# .parents is a generator of ancestors, nearest first; index 0 is <p>.
print(soup.a.parents)
print(list(soup.a.parents)[0])
print(list(soup.a.parents)[0].attrs['class'])

#结果
Lacie
<generator object PageElement.parents at 0x0000010FD9CAD200>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Bob</a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
</p>
['story']

4.方法选择器

find_all,find方法

find_all方法基本使用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from bs4 import BeautifulSoup

html = """
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
"""
soup = BeautifulSoup(html,'lxml')

# find_all(name=...) returns a ResultSet of every matching tag.
ul_tags = soup.find_all(name='ul')
print(ul_tags)
print(ul_tags[0])
print(type(ul_tags[1]))
# Nested search: each Tag object supports find_all() as well.
for ul_tag in ul_tags:
    li_tags = ul_tag.find_all(name='li')
    print(type(li_tags))
    for li_tag in li_tags:
        print(li_tag.string)

#结果:
[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>, <ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>]
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<class 'bs4.element.Tag'>
<class 'bs4.element.ResultSet'>
Foo
Bar
Jay
<class 'bs4.element.ResultSet'>
Foo
Bar

根据属性:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from bs4 import BeautifulSoup

html = """
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
"""
soup = BeautifulSoup(html,'lxml')
# find_all with an attrs dict: all elements whose id is "list-1".
print(soup.find_all(attrs={'id':'list-1'}))

# Matching by class: attrs dict and the class_ shortcut are equivalent
# (the trailing underscore avoids clashing with the `class` keyword).
print(soup.find_all(attrs={'class':'element'}))
print(soup.find_all(class_='element'))

#结果:
[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]

接受正则表达式作为参数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from bs4 import BeautifulSoup
import re

html = """
<div class="panel">
<div class="panel-body">
<a>Hello, this is a link</a>
<a>Hello, this is a link, too</a>
</div>
"""
soup = BeautifulSoup(html,'lxml')
# Find every text node whose content matches the regex "link".
# NOTE: the old spelling soup.findAll(text=...) is deprecated —
# findAll is the pre-4.0 alias of find_all, and the text= argument
# was renamed to string= in bs4 4.4.
found_elements = soup.find_all(string=re.compile('link'))
print(found_elements)
#结果:
['Hello, this is a link', 'Hello, this is a link, too']

find方法使用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from bs4 import BeautifulSoup

html = """
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
"""
soup = BeautifulSoup(html,'lxml')
# find() returns only the FIRST match (or None), not a ResultSet.
ul_tag = soup.find(name='ul')
print(ul_tag)

# class_ must match the full class string "list list-small" here.
list_element = soup.find(class_='list list-small')
print(list_element)
#结果:
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>

5.css选择器

调用select方法,传入对应的css

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from bs4 import BeautifulSoup

html = """
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
"""
soup = BeautifulSoup(html,'lxml')

# Descendant combinator with class selectors.
print(soup.select('.panel .panel-heading'))
# Every <li> somewhere under a <ul>.
print(soup.select('ul li'))
# id selector combined with a class selector.
print(soup.select('#list-2 .element'))
# Nested selection: Tag objects expose select() too.
for ul_tag in soup.select('ul'):
    print(ul_tag.select('li'))
    print(ul_tag.attrs['id'])
# Text extraction: .string vs get_text() (identical for leaf tags).
for li_tag in soup.select('li'):
    print(li_tag.string)
    print(li_tag.get_text())

#结果:
[<div class="panel-heading">
<h4>Hello</h4>
</div>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
list-1
[<li class="element">Foo</li>, <li class="element">Bar</li>]
list-2
Foo
Foo
Bar
Bar
Jay
Jay
Foo
Foo
Bar
Bar

bs项目:爬取小说

爬取小说

目标网站:https://www.biqukan8.cc/(笔趣看)

逻辑梳理:
1.拿到全部详情页面,和章节名字

https://www.biqukan8.cc/0_790/ 以元尊为例

image-20231119194803529

2.请求详情页面,匹配出小说内容

3.创建一个txt文件,把匹配出来的小说内容保存

步骤

1.请求要爬取的小说页面https://www.biqukan8.cc/0_790/

2.获取到正文卷所有章节的链接和名字

image-20231119201322458

先获取整个div标签,通过bs解析,然后获取dt标签,获取小说名字

1
2
3
4
5
6
7
8
soup = BeautifulSoup(response.text,"lxml")
# find_all is the current API name; findAll is the deprecated pre-4.0 alias.
chapters = soup.find_all('div',class_='listmain')
# Re-parse just the chapter-list <div> so .dl/.dt navigation starts there.
download_soup = BeautifulSoup(str(chapters),'lxml')
# The <dt> text looks like "[<dt>《元尊》正文卷"; slicing off the 5-char
# prefix and splitting at 》 leaves just the novel title.
novel_name = str(download_soup.dl.dt).split("》")[0][5:]

需要过滤掉最新章节的内容才开始爬取,避免重复爬取,获取到章节的链接和名字

1
2
3
4
5
6
7
8
9
10
11
# Skip the "latest chapters" teaser at the top of the list to avoid
# scraping duplicates: only start collecting links once the
# "正文卷" (main text) heading row has been seen.
begin_flag = False
for child in download_soup.dl.children:
    if child != '\n':
        if child.string == u"%s" % flag_name:
            begin_flag = True
        # `is not None` is the idiomatic null check (was: != None).
        if begin_flag and child.a is not None:
            # No trailing slash on the base URL: the href already starts
            # with "/" (matches the final script's join).
            download_url = "https://www.biqukan8.cc" + child.a.get("href")
            download_name = child.a.string
            print(download_url)
            print(download_name)

3.爬取详情页的小说文本内容

image-20231119203911843

单个章节div的获取:

1
2
3
4
# Fetch one chapter page (index 100 of the collected URL list).
txt_response = requests.get(url=url_list[100],headers=headers)
txt_soup = BeautifulSoup(str(txt_response.text),"lxml")
# The chapter body lives in <div id="content" class="showtxt">.
txt = txt_soup.find_all(id='content',class_='showtxt')
print(txt)

image-20231119204840703

获取div标签里的所有文字

1
2
3
4
# Extract the plain text from the ResultSet obtained above.
text_soup = BeautifulSoup(str(txts),"lxml")
text = text_soup.div.text  # all text nodes inside the <div>, concatenated
print(text)

4.将小说按章节读取并且保存到txt文件里

创建存放小说的目录 并写入

1
2
3
4
5
6
7
8
9
10
11
12
13
def file_write(name, text):
    """Append chapter *text* to <local_save_path>/<novel_name>/<name>.txt.

    Relies on the module-level globals local_save_path and novel_name.
    """
    directory_path = local_save_path + novel_name
    if os.path.exists(directory_path):
        print(f"目录:'{directory_path}'存在!")
    else:
        # makedirs also creates missing parent directories;
        # os.mkdir would raise FileNotFoundError if a parent is absent.
        os.makedirs(directory_path)
        print(f"目录:'{directory_path}'已经创建!")
    # Append mode so repeated runs keep adding to the same chapter file.
    name_path = os.path.join(directory_path, f"{name}.txt")
    with open(name_path, 'a+', encoding='utf-8') as file:
        file.write(text)


file_write('第一百零一章 大战来临', text=text)

最后循环爬取章节url列表中的url即可

整体代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
"""Scrape a novel from biqukan8.cc chapter by chapter and save every
chapter as its own .txt file under local_save_path/<novel_name>/."""

from bs4 import BeautifulSoup
import requests
import os
import logging
from fake_useragent import UserAgent

# Local directory that will hold one sub-directory per novel.
local_save_path = 'E:/secStudy/pythonProject/beautifulsoup/novel/'

ua = UserAgent()
headers = {
    "User-Agent": ua.random
}
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s-%(levelname)s:%(message)s')
url = 'https://www.biqukan8.cc/0_790/'


def novel_content(url, name):
    """Fetch one chapter page, extract its text and write it to disk."""
    txt_response = requests.get(url=url, headers=headers)
    txts_soup = BeautifulSoup(str(txt_response.text), "lxml")
    # The chapter body lives in <div id="content" class="showtxt">.
    txts = txts_soup.find_all(id='content', class_='showtxt')
    text_soup = BeautifulSoup(str(txts), "lxml")
    text = text_soup.div.text  # all text inside the <div>, concatenated
    file_write(name, text)


def file_write(name, text):
    """Append chapter *text* to <local_save_path>/<novel_name>/<name>.txt.

    Relies on the module-level globals local_save_path and novel_name.
    """
    directory_path = local_save_path + novel_name
    # makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair and
    # also creates any missing parent directories.
    os.makedirs(directory_path, exist_ok=True)
    write_flag = True
    name_path = os.path.join(directory_path, f"{name}.txt")
    print(f"正在爬取:{name}")
    with open(name_path, 'a+', encoding='utf-8') as file:
        for each in text:
            # The page text ends with an 'http...' ad link; stop copying
            # once the first 'h' of that trailer is seen.
            # NOTE(review): this also truncates at any 'h' in the body —
            # presumably acceptable for this site; verify on real pages.
            if each == 'h':
                write_flag = False
            if write_flag and each != '':
                file.write(each)
        file.write('\n\n')


response = requests.get(url, headers=headers)
response.encoding = 'gbk'  # the site serves GBK-encoded pages
soup = BeautifulSoup(response.text, "lxml")
# find_all is the current API name; findAll is the deprecated alias.
chapters = soup.find_all('div', class_='listmain')
# Re-parse just the chapter-list <div> so .dl/.dt navigation starts there.
download_soup = BeautifulSoup(str(chapters), 'lxml')
# The <dt> text looks like "[<dt>《元尊》正文卷"; slicing off the 5-char
# prefix and splitting at 》 leaves just the novel title.
novel_name = str(download_soup.dl.dt).split("》")[0][5:]
# Marker row that separates the "latest chapters" teaser from the
# full chapter list, e.g. "《元尊》正文卷".
flag_name = "《" + novel_name + "》" + "正文卷"

# Chapter URLs and names, collected in page order.
url_list = []
name_list = []
# Skip the teaser block to avoid scraping duplicates: only start
# collecting once the marker row has been seen.
begin_flag = False
for child in download_soup.dl.children:
    if child != '\n':
        if child.string == u"%s" % flag_name:
            begin_flag = True
        if begin_flag and child.a is not None:
            download_url = "https://www.biqukan8.cc" + child.a.get("href")
            download_name = child.a.string
            url_list.append(download_url)
            name_list.append(download_name)

# Download every chapter in order.
for chapter_url, chapter_name in zip(url_list, name_list):
    novel_content(chapter_url, chapter_name)
image-20231119213400492 image-20231119213420651

待优化:

1.章节内容存在空格,需要做换行处理

2.把所有章节存到一个txt文件中,进行分章节的处理


beautifulsoup学习
http://example.com/2023/11/27/beautifulsoup学习/
作者
r1
发布于
2023年11月27日
许可协议