Basic Usage of BeautifulSoup

Installation

pip3 install beautifulsoup4

Imports

# HTTP request libraries
import urllib.request, urllib.error
from bs4 import BeautifulSoup
# Used to work around self-signed certificate errors
import ssl

Fetching the Page

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
}
context = ssl._create_unverified_context()
req = urllib.request.Request(url='https://www.qianxiaoduan.com', headers=headers)
html = urllib.request.urlopen(req, context=context)
doc = html.read().decode('utf-8')
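
urllib.error is imported above but never used in the snippet; as an illustrative sketch (with an assumed 10-second timeout), the request could be wrapped like this to catch failures:

try:
    html = urllib.request.urlopen(req, context=context, timeout=10)
    doc = html.read().decode('utf-8')
except urllib.error.HTTPError as e:
    # Server responded with an error status code
    print('HTTP error:', e.code)
except urllib.error.URLError as e:
    # Connection-level failure (DNS, refused connection, timeout, ...)
    print('Connection failed:', e.reason)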

Creating the BeautifulSoup Object

soup = BeautifulSoup(doc, 'html.parser')
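
The matching examples below look for li, ul, a.page-numbers and id="abc" elements. As a hypothetical fragment (an assumption, not the real page), you could parse a string like this instead of the downloaded document to try them offline:

# Hypothetical markup matching the selectors used below (li, a.page-numbers, #abc)
sample_html = '''
<ul id="abc">
  <li><a class="page-numbers" href="/page/1">Page 1</a></li>
  <li><a class="page-numbers" href="/page/2">Page 2</a></li>
</ul>
'''
# Parsing an in-memory string works exactly like parsing the downloaded document
soup = BeautifulSoup(sample_html, 'html.parser')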

Matching Elements

Return all matching elements

items = soup.find_all('li')  # avoid shadowing the built-in list
for item in items:
    print(item)

Return the first matching element

print(soup.find('li'))

Match by tag

print(soup.find_all('a'))

Match by class

print(soup.find_all('a', 'page-numbers'))

Match by id

print(soup.find_all(id='abc'))

Match child nodes

print(soup.find('ul').find_all('li'))
print(soup.select('ul li'))
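
select() takes CSS selectors, so the tag/class/id matches shown above can also be written as selectors; a small sketch (select_one returns only the first match, like find):

print(soup.select('a'))                # by tag
print(soup.select('a.page-numbers'))   # by class
print(soup.select('#abc'))             # by id
print(soup.select_one('ul li'))        # first matching child node only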

Extracting Values

Get an attribute

print(soup.find('a')['href'])
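
Indexing with ['href'] raises KeyError when the attribute is missing; a sketch using Tag.get(), which returns None or a default value instead:

a = soup.find('a')
if a is not None:
    print(a.get('href'))         # None instead of KeyError when 'href' is absent
    print(a.get('href', '#'))    # or fall back to a default value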

Get text content

Single-level structure (no nested tags)

print(soup.find('li').string)

Nested structure

print(soup.find('li').get_text())
print(soup.find('li').text)
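
The difference shows up when a tag contains nested tags: .string returns None when the tag has more than one child node, while get_text() and .text concatenate all descendant text. A minimal sketch with a hypothetical fragment:

nested = BeautifulSoup('<li><a href="/p1">Page</a> 1</li>', 'html.parser').li
print(nested.string)      # None, because <li> has more than one child node
print(nested.get_text())  # 'Page 1'
print(nested.text)        # same as get_text()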