BeautifulSoup基本用法
安装
pip3 install beautifulsoup4
引入
# HTTP 请求库
import urllib.request, urllib.error
from bs4 import BeautifulSoup
# 解决自签证书错误问题
import ssl
获取页面
# Send a browser-like User-Agent header with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
}
# WARNING: this SSL context skips certificate verification. It works around
# self-signed certificate errors but must not be used against untrusted hosts.
context = ssl._create_unverified_context()
req = urllib.request.Request(url='https://www.qianxiaoduan.com', headers=headers)
# Use the response as a context manager so the connection is closed as soon
# as the body has been read (the original never closed it).
with urllib.request.urlopen(req, context=context) as html:
    doc = html.read().decode('utf-8')
获取BeautifulSoup对象
# Parse the downloaded markup with Python's built-in HTML parser.
soup = BeautifulSoup(markup=doc, features='html.parser')
匹配元素
返回匹配的所有元素
# find_all() returns every matching element; print them one by one.
# The result is named li_items so the builtin `list` is not shadowed.
li_items = soup.find_all('li')
for item in li_items:
    print(item)
返回匹配的第一个元素
# find() returns only the first matching element (or None when nothing matches).
first_li = soup.find('li')
print(first_li)
匹配标签
# Select every <a> element by tag name.
print(soup.find_all(name='a'))
匹配class
# Select <a> elements whose CSS class is 'page-numbers'.
# class_ has a trailing underscore because `class` is a Python keyword.
print(soup.find_all('a', class_='page-numbers'))
匹配id
# Select elements whose id attribute equals 'abc'.
print(soup.find_all(attrs={'id': 'abc'}))
匹配子节点
# Search for <li> elements underneath the first <ul> in the document.
first_ul = soup.find('ul')
print(first_ul.find_all('li'))
# The same kind of lookup expressed as a CSS selector.
print(soup.select('ul li'))
获取所需值
获取属性
# Tag attributes are read with dictionary-style indexing.
first_link = soup.find('a')
print(first_link['href'])
获取内容
单层结构
# .string is intended for an element with a single child (flat structure).
flat_li = soup.find('li')
print(flat_li.string)
多层结构
# For nested markup, get_text() and the .text property both return the
# combined text of the element and its descendants.
nested_li = soup.find('li')
print(nested_li.get_text())
print(nested_li.text)
css三角形 »