# Sample document ("The Dormouse's story") used as the parsing fixture.
# The value is assembled from adjacent string literals; every space inside the
# quotes is significant and mirrors the original one-line text exactly.
html = (
    " <html><head><title>The Dormouse's story</title></head> <body> "
    '<p class="title" name="dromouse"><b>'
    "The Dormouse's story</b></p> "
    '<p class="story">Once upon a time there were three little sisters; '
    'and their names were '
    '<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; '
    'and they lived at the bottom of a well.</p> '
    '<p class="story">...</p> '
)
# Result (the document as parsed and re-formatted by BeautifulSoup):
# <html> <head> <title> The Dormouse's story </title> </head> <body> <p class="title" name="dromouse"> <b> The Dormouse's story </b> </p> <p class="story"> Once upon a time there were three little sisters; and their names were <a class="sister" href="http://example.com/elsie" id="link1"> <!-- Elsie --> </a> , <a class="sister" href="http://example.com/lacie" id="link2"> Lacie </a> and <a class="sister" href="http://example.com/tillie" id="link3"> Tillie </a> ; and they lived at the bottom of a well. </p> <p class="story"> ... </p> </body> </html>
# Same "Dormouse's story" fixture again, re-assigned for the next example.
# Built from adjacent string literals; the resulting value is byte-identical
# to the original single-line triple-quoted text (spaces included).
html = (
    " <html><head><title>The Dormouse's story</title></head> <body> "
    '<p class="title" name="dromouse"><b>'
    "The Dormouse's story</b></p> "
    '<p class="story">Once upon a time there were three little sisters; '
    'and their names were '
    '<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; '
    'and they lived at the bottom of a well.</p> '
    '<p class="story">...</p> '
)
# Example: child and descendant navigation on the first <p> of a document.
# The fixture text is kept byte-identical to the original one-line literal;
# the statements that had been fused onto the same line are separated again
# so the snippet is valid Python.
html = (
    " <html> <head> <title>The Dormouse's story</title> </head> <body> "
    '<p class="story"> Once upon a time there were three little sisters; '
    'and their names were '
    '<a href="http://example.com/elsie" class="sister" id="link1"> '
    '<span>Elsie</span> </a> '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a> '
    'and they lived at the bottom of a well. </p> '
    '<p class="story">...</p> '
)
soup = BeautifulSoup(html, 'lxml')

# The examples below are intentionally left disabled, as in the original.

# Outputs a list containing all child nodes of the <p> node, text included.
# print(soup.p.contents)

# Child nodes.
# print(soup.p.children)
# for i, child in enumerate(soup.p.children):
#     print(child)

# Descendant nodes.
# (The original looped over soup.p.children here, which repeats the example
# above; iterating .descendants matches the heading of this example.)
# print(soup.p.descendants)
# for i, child in enumerate(soup.p.descendants):
#     print(child)
# Example: sibling navigation starting from the first <a> element.
# The fixture text is byte-identical to the original one-line literal; the
# statements that had been fused onto one line (and hidden behind '#') are
# restored as separate, executable statements.
html = (
    ' <html> <body> '
    '<p class="story"> Once upon a time there were three little sisters; '
    'and their names were '
    '<a href="http://example.com/elsie" class="sister" id="link1"> '
    '<span>Elsie</span> </a> Hello '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a> '
    'and they lived at the bottom of a well. </p> '
    '</body> '
)
soup = BeautifulSoup(html, 'lxml')

# Previous sibling node:
print(soup.a.previous_sibling)
# Next sibling node:
print(soup.a.next_sibling)
# All following sibling nodes.
# NOTE: this prints the generator object itself, not the nodes; wrap the
# expression in list(...) to see the individual siblings.
print(soup.a.next_siblings)
# All preceding sibling nodes (also a generator, see the note above):
print(soup.a.previous_siblings)
# Output:
# Once upon a time there were three little sisters; and their names were
# Hello
# <generator object PageElement.next_siblings at 0x000001FE1EBAD270>
# <generator object PageElement.previous_siblings at 0x000001FE1EBAD270>
# Example: read the text of the element that directly follows the first <a>.
# The fixture text is byte-identical to the original one-line literal; the
# statements that had been fused onto one line are separated again so the
# snippet is valid Python.
html = (
    ' <html> <body> '
    '<p class="story"> Once upon a time there were three little sisters; '
    'and their names were '
    # No whitespace between the two <a> tags: the next sibling of the first
    # <a> is therefore the second <a> itself rather than a text node.
    '<a href="http://example.com/elsie" class="sister" id="link1">Bob</a>'
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> '
    '</p> '
)
soup = BeautifulSoup(html, 'lxml')

# Text content of the first <a>'s next sibling.
print(soup.a.next_sibling.string)
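# Result:
# Lacie
# <generator object PageElement.parents at 0x0000010FD9CAD200>
# <p class="story"> Once upon a time there were three little sisters; and their names were <a class="sister" href="http://example.com/elsie" id="link1">Bob</a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> </p>
# ['story']
# NOTE(review): only the first value ("Lacie") is produced by the single print
# call visible in this snippet; the remaining values presumably come from
# parent/attribute lookups that are not shown here - confirm against the
# original example.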
# Example: locate elements by tag name with find_all().
# `soup` is expected to be a parsed document that is created outside this
# snippet (it must contain at least two <ul> elements for the [1] lookup).

# Find all <ul> elements. The lookup is done once and reused below.
ul_tags = soup.find_all(name='ul')
print(ul_tags)
print(ul_tags[0])
print(type(ul_tags[1]))

# Iterate over every <ul>; for each one, show the type of the <li> lookup
# result and then the text of each <li>.
for ul in ul_tags:
    li_tags = ul.find_all(name='li')
    print(type(li_tags))
    for li in li_tags:
        print(li.string)
import re

from bs4 import BeautifulSoup

# Example: search the document's text nodes with a regular expression.
# The fixture text is byte-identical to the original one-line literal.
html = (
    ' <div class="panel"> <div class="panel-body"> '
    '<a>Hello, this is a link</a> '
    '<a>Hello, this is a link, too</a> '
    '</div> '
)
soup = BeautifulSoup(html, 'lxml')

# Use a regular expression to find every text node that contains "link".
# The result holds the matching strings, not the enclosing <a> tags.
# find_all(string=...) replaces the deprecated findAll(text=...) spelling.
found_elements = soup.find_all(string=re.compile('link'))
print(found_elements)
# Result: ['Hello, this is a link', 'Hello, this is a link, too']
# Example: CSS-selector queries with select().
# `soup` is expected to be a parsed document that is created outside this
# snippet; every <ul> in it must carry an id attribute for the lookup below.

# Every <li> located under a <ul>, then the .element nodes inside #list-2.
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))

# Nested selection: run a further select() on each matched <ul>, then show
# that <ul>'s id attribute.
for ul in soup.select('ul'):
    print(ul.select('li'))
    print(ul.attrs['id'])

# Getting the content: print the text of each <li> in two ways.
for li in soup.select('li'):
    print(li.string)
    print(li.get_text())
# Result:
# [<div class="panel-heading"> <h4>Hello</h4> </div>]
# [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
# [<li class="element">Foo</li>, <li class="element">Bar</li>]
# [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
# list-1
# [<li class="element">Foo</li>, <li class="element">Bar</li>]
# list-2
# Foo
# Foo
# Bar
# Bar
# Jay
# Jay
# Foo
# Foo
# Bar
# Bar
# NOTE(review): the first list (the panel-heading <div>) presumably comes from
# a select() call that is not shown in the snippet - confirm against the
# original example.