父標籤.子標籤.子標籤.....
## Example - try to guess what the following code prints
import requests
from bs4 import BeautifulSoup

# The response is called `response` (not `re`) so it cannot be confused with,
# or later shadow, the standard-library `re` module.
# The timeout keeps the script from waiting forever on an unresponsive server.
response = requests.get('https://tw.news.yahoo.com/%E9%A6%99%E6%B8%AF%E9%9B%BB%E5%BD%B1%E9%87%91%E5%83%8F%E7%8D%8E%EF%BC%8F%E5%90%B3%E6%85%B7%E4%BB%81%E3%80%8C%E5%85%A8%E7%B2%B5%E8%AA%9E%E5%8F%97%E8%A8%AA%E3%80%8D%E7%B6%B2%E8%AA%87%E8%AE%9A-%E8%A8%B1%E5%85%89%E6%BC%A2%E5%B8%A5%E7%82%B8%E7%B4%85%E6%AF%AF-104634573.html', timeout=10)
bts = BeautifulSoup(response.text, 'lxml')

# Dotted access (parent.child.child...) returns the first tag with that name
# at each step.
print(bts.head.title.string)
# Subscripting a tag reads one of its attributes; a missing attribute raises
# KeyError.
print(bts.head.meta['content'])
## Example - contents
from bs4 import BeautifulSoup
from bs4.element import Tag

html_string = """
<div id='d1'>
<span>sub iterator</span>
<ul>
<li>one</li>
<li>two</li>
<li>three</li>
</ul>
</div>
<div id='d2'>
<span>sub iterator</span>
<ul>
<li><span>one</span></li>
<li>two</li>
<li>three</li>
</ul>
</div>
"""
bts = BeautifulSoup(html_string, 'lxml')
tag_d2 = bts.select('#d2')
tag_sub_ul = tag_d2[0].ul

# .contents is the list of direct children of <ul>. The newline text nodes
# between the <li> tags are part of that list too, so only Tag objects are
# printed.
for child in tag_sub_ul.contents:
    if isinstance(child, Tag):
        print(child.string)
one two three
## Example - children
from bs4 import BeautifulSoup
from bs4.element import Tag

html_string = """
<div id='d1'>
<span>sub iterator</span>
<ul>
<li>one</li>
<li>two</li>
<li>three</li>
</ul>
</div>
<div id='d2'>
<span>sub iterator</span>
<ul>
<li><span>one</span></li>
<li>two</li>
<li>three</li>
</ul>
</div>
"""
bts = BeautifulSoup(html_string, 'lxml')
tag_d2 = bts.select('#d2')
tag_sub_ul = tag_d2[0].ul

# .children iterates over the direct children of <ul>. The newline text nodes
# between the <li> tags are yielded as well, so only Tag objects are printed.
for child in tag_sub_ul.children:
    if isinstance(child, Tag):
        print(child.string)
one two three
contents、children差異
contents與children在使用上幾乎沒有差異:contents回傳的是一個list,children回傳的則是一個迭代器(iterator)物件。
## 範例 - descendants
from bs4 import BeautifulSoup
from bs4.element import NavigableString

html_string = """
<div id='d1'>
<span>sub iterator</span>
<ul>
<li>one</li>
<li>two</li>
<li>three</li>
</ul>
</div>
<div id='d2'>
<span>sub iterator</span>
<ul>
<li><span>one</span></li>
<li>two</li>
<li>three</li>
</ul>
</div>
"""
bts = BeautifulSoup(html_string, 'lxml')
tag_d2 = bts.select('#d2')

# .descendants walks every node below the tag (children, grandchildren, ...),
# not just the direct children.
for node in tag_d2[0].descendants:
    # Print text nodes only, skipping whitespace-only ones. Using strip()
    # rather than comparing with '\n' keeps the filter working when the HTML
    # is indented.
    if isinstance(node, NavigableString) and node.strip():
        print(node)
sub iterator one two three
403 Forbidden
的錯誤訊息(如下述範例)parser
## Example
import requests
from bs4 import BeautifulSoup

# No browser-like User-Agent is supplied here (requests sends its own default
# one); this request is the one that demonstrates the "403 Forbidden" page.
# The timeout keeps the script from waiting forever on an unresponsive server.
response = requests.get('https://www.imdb.com/chart/top/', timeout=10)
print(response.text)
<html> <head><title>403 Forbidden</title></head> <body> <center><h1>403 Forbidden</h1></center> </body> </html>
403 Forbidden 解法
Step 1:透過瀏覽器的開發人員工具 -> Network -> Headers,查找 User-Agent。
Step 2:將查找到的 User-Agent 加入 requests 的 headers 參數中,如:
requests.get(url, headers={"user-agent": 查找到的agent})
## Example
import requests
from bs4 import BeautifulSoup

# Browser User-Agent string (looked up in the browser's developer tools:
# Network -> Headers -> User-Agent). Adjacent literals are concatenated into
# one string.
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/116.0.0.0 Safari/537.36"
)

# Send the browser User-Agent with the request; the timeout keeps the script
# from waiting forever on an unresponsive server.
response = requests.get(
    'https://www.imdb.com/chart/top/',
    headers={"user-agent": user_agent},
    timeout=10,
)
print(response.text)
parent #屬性
find_parent() #函數
## Example - try to guess what the following code prints
from bs4 import BeautifulSoup

html_string = """
<div id='d1'>
<span>sub iterator 1</span>
<ul>
<li>one</li>
<li>two</li>
<li>three</li>
</ul>
</div>
<div id='d2'>
<span>sub iterator 2</span>
<ul>
<li><span>one</span></li>
<li>two</li>
<li>three</li>
</ul>
</div>
"""
bts = BeautifulSoup(html_string, 'lxml')
tag_li = bts.li

# Climb two levels using the .parent attribute ...
grandparent_by_attribute = tag_li.parent.parent
print(grandparent_by_attribute.span.string)

# ... and the same two levels using the find_parent() method.
grandparent_by_method = tag_li.find_parent().find_parent()
print(grandparent_by_method.span.string)
## Example
from bs4 import BeautifulSoup

html_string = """
<div id='d1'>
<span>sub iterator 1</span>
<ul>
<li>one</li>
<li>two</li>
<li>three</li>
</ul>
</div>
<div id='d2'>
<span>sub iterator 2</span>
<ul>
<li><span>one</span></li>
<li>two</li>
<li>three</li>
</ul>
</div>
"""
bts = BeautifulSoup(html_string, 'lxml')
sele_ul = bts.ul
# Tip: print(sele_ul) here to inspect the selected <ul>.
sele_ul_li = sele_ul.li

# .next_sibling is applied twice here: the first hop gives the node directly
# after the first <li>, the second hop gives the node after that.
node_after_first_li = sele_ul_li.next_sibling
print(node_after_first_li.next_sibling)

# find_next_sibling() is called once on the same <li>.
print(sele_ul_li.find_next_sibling())
<li>two</li> <li>two</li>
## Example - guess what the following code prints
from bs4 import BeautifulSoup

html_string = """
<div id='d1'>
<span>sub iterator 1</span>
<ul>
<li>one</li>
<li>two</li>
<li>three</li>
</ul>
</div>
<div id='d2'>
<span>sub iterator 2</span>
<ul>
<li><span>one</span></li>
<li>two</li>
<li>three</li>
</ul>
</div>
"""
bts = BeautifulSoup(html_string, 'lxml')
select_d2 = bts.select('#d2')[0]
# Tip: print(select_d2) here to inspect the selected <div>.

# Start from the first <li> inside #d2 and step forward to its next sibling.
first_li_in_d2 = select_d2.li
select_d2_li = first_li_in_d2.find_next_sibling()
# Tip: print(select_d2_li) here to see which tag that is.

# Then step backwards again from that tag.
print(select_d2_li.find_previous_sibling())
next_element #屬性
previous_element #屬性
## Example
from bs4 import BeautifulSoup

html_string = """
<div id='d1'>
<span>sub iterator 1</span>
<ul>
<li>one</li>
<li>two</li>
<li>three</li>
</ul>
</div>
<div id='d2'>
<span>sub iterator 2</span>
<ul>
<li><span>one</span></li>
<li>two</li>
<li>three</li>
</ul>
</div>
"""
bts = BeautifulSoup(html_string, 'lxml')
tag_div = bts.div

# Two .next_element hops starting from the first <div>.
second_element = tag_div.next_element.next_element
print(second_element)

# Guess the output: from that element, jump to its next sibling and take two
# more .next_element hops.
sibling_of_second = second_element.find_next_sibling()
print(sibling_of_second.next_element.next_element)
<span>sub iterator 1</span>
next_elements #屬性
previous_elements #屬性
## Example
from bs4 import BeautifulSoup
from bs4.element import NavigableString

html_string = """
<div id='d1'>
<span>sub iterator 1</span>
<ul>
<li>one</li>
<li>two</li>
<li>three</li>
</ul>
</div>
<div id='d2'>
<span>sub iterator 2</span>
<ul>
<li><span>one</span></li>
<li>two</li>
<li>three</li>
</ul>
</div>
"""
bts = BeautifulSoup(html_string, 'lxml')
tag_d1 = bts.select('#d1')
# Tip: print(tag_d1[0]) here to inspect the starting tag.

# .next_elements yields the nodes that follow the tag, one after another.
for element in tag_d1[0].next_elements:
    # Print text nodes only, skipping whitespace-only ones. Using strip()
    # rather than comparing with '\n' keeps the filter working when the HTML
    # is indented.
    if isinstance(element, NavigableString) and element.strip():
        print(element)
sub iterator 1 one two three sub iterator 2 one two three
練習:請沿用上述範例的html字串,練習使用previous_elements,並觀察其執行狀況。