BeautifulSoup is a powerful Python library for parsing and extracting data from HTML and XML documents. It is widely used in web scraping and data mining, offering a simple API for navigating complex page structures.
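
Before running the examples below, the library and a parser need to be installed (a minimal setup; the lxml parser is optional, since BeautifulSoup can fall back to Python's built-in html.parser):

pip install beautifulsoup4 lxml

from bs4 import BeautifulSoup

# Parse an in-memory HTML snippet; no network access needed
soup = BeautifulSoup('<html><body><p>hello</p></body></html>', 'lxml')
print(soup.p.get_text())  # hello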

Basic Example: Parsing a Page

from bs4 import BeautifulSoup
import requests

# Fetch the page content
url = 'https://www.baidu.com/'
response = requests.get(url)
response.encoding = 'utf-8'
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(response.text, 'lxml')
# Extract the page title
title = soup.find('title').get_text()
print(f"Page title: {title}")
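
BeautifulSoup also exposes tags as attributes and supports CSS selectors; a short sketch (reusing the soup object from above) showing two equivalent ways to reach the title:

# Attribute-style access to the first <title> tag in the document
print(soup.title.string)

# CSS selectors via select(), which returns a list of matching tags
for tag in soup.select('head > title'):
    print(tag.get_text())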

Basic Example: Searching with find_all

import requests
from bs4 import BeautifulSoup

base_url = 'http://www.baidu.com'
res = requests.get(base_url)    # send a GET request
res.encoding = 'utf-8'          # set the decoding charset; utf-8 is the universal encoding, gbk covers Chinese-specific pages
# Create a BeautifulSoup object
soup = BeautifulSoup(res.text, 'lxml')
print("ALL: ", soup)
# Find all <a> tags
a_all = soup.find_all('a')
print('All <a> tags:\n{}'.format(a_all))
# Find <a> tags with href="http://v.baidu.com"
a_attrs = soup.find_all('a', attrs={'href': 'http://v.baidu.com'})
print('<a> tags with the given attribute:\n{}'.format(a_attrs))
a_attrs = soup.find_all(href='//map.baidu.com/')
print('<a> tags with the given attribute:\n{}'.format(a_attrs))
# Find <a> tags whose text is "地图" (Map)
a_string = soup.find_all('a', string='地图')
print('<a> tags with the given text:\n{}'.format(a_string))
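
find_all returns a list of Tag objects, so individual attributes can be read with get() and text with get_text(). A small sketch (assuming the same soup object as above) that collects every link's target and label:

# Iterate over the result list and pull out attributes
for a in soup.find_all('a'):
    href = a.get('href')          # None if the tag has no href attribute
    text = a.get_text(strip=True)
    if href:
        print(f'{text} -> {href}')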

Scraping a Novel with BeautifulSoup

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/96.0.4664.45 Safari/537.36'
}

for i in range(1, 201):
    # txksw.com ("偷香看书网", a novel-reading site)
    base_url = 'https://www.txksw.com/book/S2H7-' + str(i) + '.html'
    res = requests.get(base_url, headers=headers)  # send a GET request
    res.encoding = 'utf-8'  # set the decoding charset (utf-8 here; some Chinese sites use gbk)
    soup = BeautifulSoup(res.text, 'lxml')
    # The chapter title sits in a <span class="cur"> element
    title = soup.find_all('span', attrs={"class": "cur"})[0].string.strip()
    title += "\n"
    print(title)
    with open("example.txt", "a", encoding="utf-8") as file:
        file.write(title)
    content_all = ''
    contents = soup.find_all('p')
    for content in contents:
        if content.string:  # .string is None when a tag has nested children
            content_all += content.string + '\n'
    # Each chapter is split across pages named S2H7-<i>-<s>.html
    for s in range(2, 10):
        branch_url = 'https://www.txksw.com/book/S2H7-' + str(i) + '-' + str(s) + '.html'
        res_branch = requests.get(branch_url, headers=headers)  # send a GET request
        res_branch.encoding = 'utf-8'
        soup_branch = BeautifulSoup(res_branch.text, 'lxml')
        contents_branch = soup_branch.find_all('p')
        for content in contents_branch:
            if content.string:
                content_all += content.string + '\n'
        # Stop when the page has no "下一页" ("next page") link
        if not soup_branch.find_all('a', string="下一页"):
            print(f"No page {s + 1}")
            break
    with open("example.txt", "a", encoding="utf-8") as file_content:
        file_content.write(content_all)
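
The script above opens a fresh connection for every page and never checks the HTTP status. A hedged refactoring sketch (the URL pattern is taken from the code above; fetch_chapter is a hypothetical helper name) that reuses one session, validates responses, and pauses between requests:

import time
import requests
from bs4 import BeautifulSoup

session = requests.Session()  # reuse one connection across requests
session.headers.update({'User-Agent': 'Mozilla/5.0'})

def fetch_chapter(url):
    # Hypothetical helper: fetch one page, return its <p> text, or None on failure
    res = session.get(url, timeout=10)
    if res.status_code != 200:
        return None
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'lxml')
    return '\n'.join(p.string for p in soup.find_all('p') if p.string)

text = fetch_chapter('https://www.txksw.com/book/S2H7-1.html')
if text:
    print(text[:200])
time.sleep(1)  # be polite: pause between requests to avoid hammering the site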